Installs and Imports¶

In [8]:
import re
import string
import subprocess
import sys
import warnings

warnings.filterwarnings('ignore')

REQS = [
    ('pip', 'pip==24.2'),
    ('lightgbm', 'lightgbm==4.5.0'),
    ('matplotlib', 'matplotlib==3.9.2'),
    ('mlxtend', 'mlxtend==0.23.1'),
    ('nltk', 'nltk==3.9.1'),
    ('numpy', 'numpy==2.0.2'),
    ('optuna', 'optuna==4.0.0'),
    ('pandas', 'pandas==2.2.2'),
    ('seaborn', 'seaborn==0.13.2'),
    ('sklearn', 'scikit-learn==1.5.2'),
    ('statsmodels', 'statsmodels==0.14.3'),
    ('umap-learn', 'umap-learn==0.5.6'),
    ('xgboost', 'xgboost==2.1.1'),
]

try:
    subprocess.check_call([sys.executable, '-m', 'ensurepip'])
except Exception as e:
    print(e, file=sys.stderr)


def ensure_installed(module_info):
    _, install_str = module_info
    try:
        subprocess.check_call([sys.executable, '-m',
                               'pip', 'install', '--quiet',
                               install_str])
        print(f'Installed "{install_str}".')
    except Exception as e:
        print(e, file=sys.stderr)


for m in REQS:
    ensure_installed(m)

# Standard libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Machine learning and data processing
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    calinski_harabasz_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    silhouette_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Statistical modeling
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Dimensionality reduction
import umap

# Hyperparameter optimization
import optuna

# Other machine learning libraries
import lightgbm as lgb
from xgboost import XGBClassifier
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


def find_columns_with_missing(data, columns):
    """Finding features that have a lot of missing data"""
    print()
    print('Finding columns with missing data...')
    data_cleaned = data
    missing = []
    i = 0
    for col in columns:
        missing.append(data[col].isnull().sum())
        if missing[i] > 0:
            print()
            print(f'Column {col} is missing {missing[i]} values.')
            print(f'Proportion of missing data is {missing[i]/len(data)}.')
            if missing[i]/len(data) >= 0.9:
                print(f'Dropping column {col}...')
                data_cleaned = data_cleaned.drop(columns=col)
        i += 1
    return missing, data_cleaned


def hex_to_rgb(hex_color):
    """Function to convert hex to RGB"""
    # Remove the '#' if it exists
    hex_color = hex_color.lstrip('#')

    # Convert hex to integer and split into RGB components
    return [int(hex_color[i:i+2], 16) for i in (0, 2, 4)]


def preprocess_text(text):
    """Preprocessing function"""
    text = text.lower()
    # Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removes punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    return ' '.join(tokens)


def plot_silhouette_bar_across_experiments(model_names, silhouette_scores):
    n_experiments = len(silhouette_scores)
    n_models = len(model_names)
    bar_width = 0.2
    index = np.arange(n_experiments)
    plt.figure(figsize=(12, 6))

    for i, model_name in enumerate(model_names):
        sil_scores = [exp_scores[i] for exp_scores in silhouette_scores]
        plt.bar(index + i * bar_width,sil_scores, bar_width, label=model_name)

    plt.xlabel('Experiments')
    plt.ylabel('Silhouette scores')
    plt.title('Silhouette scores Across Models and Experiments')
    plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
    plt.legend()
    plt.tight_layout()
    plt.show()


def visualize_ch_index_across_experiments(model_names, ch_scores):

    n_experiments = len(ch_scores)
    n_models = len(model_names)
    bar_width = 0.2
    index = np.arange(n_experiments)
    plt.figure(figsize=(12, 6))

    for i, model_name in enumerate(model_names):
        ch_score = [exp_scores[i] for exp_scores in ch_scores]
        plt.bar(index + i * bar_width, ch_score, bar_width, label=model_name)

    plt.xlabel('Experiments')
    plt.ylabel('Calinski-Harabasz Index')
    plt.title('Calinski-Harabasz Index Across Models and Experiments')
    plt.xticks(index + bar_width * (n_models - 1) / 2, [f'Exp {i+1}' for i in range(n_experiments)])
    plt.legend()
    plt.tight_layout()
    plt.show()


class KMeansClustering:
    def __init__(self, data):
        self.data = data
        self.best_params = None
        self.kmeans_model = None

    def tune_hyperparameters(self, n_trials=15):
        def objective_kmeans(trial):
            n_clusters = trial.suggest_int('n_clusters', 2, 10)
            init_method = trial.suggest_categorical('init', ['k-means++', 'random'])

            kmeans = KMeans(n_clusters=n_clusters, init=init_method, random_state=42)
            kmeans.fit(self.data)
            labels = kmeans.labels_
            score = silhouette_score(self.data, labels)
            return score

        study = optuna.create_study(direction="maximize")
        study.optimize(objective_kmeans, n_trials=n_trials)
        self.best_params = study.best_params
        print("Best params:", self.best_params)

    def fit_model(self):
        self.kmeans_model = KMeans(n_clusters=self.best_params['n_clusters'],
                                   init=self.best_params['init'],
                                   random_state=42)
        self.kmeans_model.fit(self.data)

    def visualize_clusters(self, umap_embedding, feature):
        labels = self.kmeans_model.labels_
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        # Scatter plot in 3D
        scatter = ax.scatter(
            umap_embedding[:, 0],
            umap_embedding[:, 1],
            umap_embedding[:, 2],
            c=labels,
            cmap='viridis',
            s=30
        )
        # Add labels and title
        ax.set_xlabel('UMAP Dimension 1')
        ax.set_ylabel('UMAP Dimension 2')
        ax.set_zlabel('UMAP Dimension 3')
        plt.title(f'3D UMAP of K-Means Clusters on {feature}')
        # Add a color bar for better visual distinction of clusters
        plt.colorbar(scatter)
        # Show the plot
        plt.show()

    def plot_elbow_method(self, k_range=(2, 10)):
        """
        Plot the Elbow Method for choosing the optimal number of clusters
        Args:
        - k_range: tuple, range of cluster numbers to evaluate
        """
        inertia = []
        K = range(k_range[0], k_range[1] + 1)
        for k in K:
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(self.data)
            inertia.append(kmeans.inertia_)  # Sum of squared distances to closest cluster center

        plt.figure(figsize=(8, 6))
        plt.plot(K, inertia, 'bo-', markersize=8)
        plt.title('Elbow Method for Optimal K')
        plt.xlabel('Number of clusters')
        plt.ylabel('Inertia (Sum of squared distances)')
        plt.grid(True)
        plt.show()

    def output_label(self):
        return self.kmeans_model.labels_

    def silhoutte(self):
        score = silhouette_score(self.data, self.kmeans_model.labels_)
        print(f'The Silhouette score is {score}')
        return score

    def calinski(self):
        if len(np.unique(self.kmeans_model.labels_)) > 1:  # Only calculate if there are clusters
            score = calinski_harabasz_score(self.data, self.kmeans_model.labels_)
        else:
            score = np.nan  # If only one cluster (or all noise), set to NaN
        print(f'The Callinski index is {score}')
        return score


class DBSCANClustering:
    def __init__(self, data):
        self.data = data
        self.best_params = None
        self.dbscan_model = None

    def tune_hyperparameters(self, n_trials=15):
        def objective_dbscan(trial):
            eps = trial.suggest_float('eps', 0.1, 2.0)
            min_samples = trial.suggest_int('min_samples', 3, 20)

            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            dbscan.fit(self.data)
            labels = dbscan.labels_
            if len(set(labels)) > 1:
                score = silhouette_score(self.data, labels)
            else:
                score = -1
            return score

        study = optuna.create_study(direction="maximize")
        study.optimize(objective_dbscan, n_trials=n_trials)
        self.best_params = study.best_params
        print("Found best params:", self.best_params)

    def fit_model(self):
        self.dbscan_model = DBSCAN(eps=self.best_params['eps'], min_samples=self.best_params['min_samples'])
        self.dbscan_model.fit(self.data)

    def visualize_clusters_and_outliers_3D(self, umap_embedding, feature):
        labels = self.dbscan_model.labels_

        # Separate clustered points and noise points
        clustered_points = umap_embedding[labels >= 0]  # Points part of a cluster
        clustered_labels = labels[labels >= 0]
        outliers = umap_embedding[labels == -1]  # Noise points

        # Create a 3D plot
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111, projection='3d')

        # Plot the clustered points in different colors
        scatter = ax.scatter(clustered_points[:, 0], clustered_points[:, 1], clustered_points[:, 2],
                             c=clustered_labels, cmap='viridis', s=30)

        # Plot the outliers (noise points) in red with 'x' markers
        ax.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 2], c='red', marker='x', s=80, label='Outliers')

        # Add labels and title
        ax.set_xlabel('UMAP Dimension 1')
        ax.set_ylabel('UMAP Dimension 2')
        ax.set_zlabel('UMAP Dimension 3')
        ax.set_title(f'DBSCAN 3D Clusters with Outliers on {feature}')
        # Add a legend and color bar for clusters
        plt.legend()
        plt.colorbar(scatter, ax=ax)
        plt.show()

    def output_label(self):
        return self.dbscan_model.labels_

    def silhoutte(self):
        score = silhouette_score(self.data, self.dbscan_model.labels_)
        print(f'The Silhouette score is {score}')
        return score

    def calinski(self):
        if len(np.unique(self.dbscan_model.labels_)) > 1:  # Only calculate if there are clusters
            score = calinski_harabasz_score(self.data, self.dbscan_model.labels_)
        else:
            score = np.nan  # If only one cluster (or all noise), set to NaN
        print(f'The Callinski index is {score}')
        return score


class ClusteringDataRetriever:
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def get_data_with_labels(self):
        # If Data is in a numpy array, convert it to a pandas DataFrame
        if isinstance(self.data, np.ndarray):
            df = pd.DataFrame(self.data)
        else:
            df = self.data.copy()  # If already a DataFrame

        # Add a new column for the cluster labels
        df['Cluster_Label'] = self.labels

        return df[['gender', 'gender:confidence', 'Cluster_Label']]

    def get_cluster_data(self, cluster_label):
        # Retrieve data points belonging to a specific cluster.
        df = self.get_data_with_labels()
        return df[df['Cluster_Label'] == cluster_label]

    def get_noise_data(self):
        # Retrieve Records classified as noise (-1 label) in DBSCAN.
        return self.get_cluster_data(-1)
Installed "pip==24.2".
Installed "lightgbm==4.5.0".
Installed "matplotlib==3.9.2".
Installed "mlxtend==0.23.1".
Installed "nltk==3.9.1".
Installed "numpy==2.0.2".
Installed "optuna==4.0.0".
Installed "pandas==2.2.2".
Installed "seaborn==0.13.2".
Installed "scikit-learn==1.5.2".
Installed "statsmodels==0.14.3".
Installed "umap-learn==0.5.6".
Installed "xgboost==2.1.1".

EDA¶

In [9]:
# Main starts here
# Load the dataset
df = pd.read_csv('twitter_user_data.csv', encoding='ISO-8859-1')

# Quick view of the dataset
print()
print('Dataset Overview')
print(df.info())
print(df.head())

all_features = df.columns

missing_col, df_cleaned = find_columns_with_missing(df, all_features)

# Dropping rows where 'gender' is missing
df_cleaned = df_cleaned.dropna(subset=['gender'])

# Drop the 'profile_yn' column since it is not relevant to human/non-human classification
df_cleaned = df_cleaned.drop(columns=['profile_yn'])

# Now that we have handled the missing data, you can proceed with further analysis
print()
print('Dataset Overview')
print(df_cleaned.info())
print(df_cleaned.head())

print()
print('---- EXPLORATORY DATA ANALYSIS (EDA) ----')

current_num_features = df.select_dtypes(include=[np.number])

# Plot distribution of each numerical feature with gender as hue using seaborn
for feature in current_num_features:
    plt.figure(figsize=(8, 6))
    sns.histplot(df_cleaned, x=feature, hue='gender', bins=30, kde=True)
    plt.title(f'Distribution of {feature} by Gender')
    plt.show()

# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_cleaned)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()

# Plot distribution of 'tweet_count' and 'retweet_count'
for column in ['tweet_count', 'retweet_count']:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df_cleaned, x=column, kde=True, bins=30)
    plt.title(f'Distribution of {column.replace("_", " ").capitalize()}')
    plt.show()

# Correlation analysis for numerical features
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned[['tweet_count', 'retweet_count', 'fav_number']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Extracting date from 'created' and 'tweet_created' for time-based analysis
df_cleaned['profile_created_year'] = pd.to_datetime(df_cleaned['created']).dt.year
df_cleaned['tweet_created_year'] = pd.to_datetime(df_cleaned['tweet_created']).dt.year

# Ensure 'created' and tweet_created are in datetime format
df_cleaned['created'] = pd.to_datetime(df_cleaned['created'], errors='coerce')
df_cleaned['tweet_created'] = pd.to_datetime(df_cleaned['tweet_created'], errors='coerce')

# assuming Data was up-to-date
df_cleaned['account_age'] = (pd.Timestamp.now() - df_cleaned['created']).dt.days

df_cleaned['tweets_per_day'] = df_cleaned['tweet_count'] / df_cleaned['account_age']
df_cleaned['retweets_per_day'] = df_cleaned['retweet_count'] / df_cleaned['account_age']
df_cleaned['favorites_per_day'] = df_cleaned['fav_number'] / df_cleaned['account_age']

# Plotting the distribution of profile creation over the years
plt.figure(figsize=(8, 6))
sns.histplot(df_cleaned['profile_created_year'], kde=False, bins=15)
plt.title('Distribution of Profile Creation Years')
plt.xlabel('Profile Created Year')
plt.ylabel('count')
plt.show()

# Plotting the histogram of tweets per day
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['tweets_per_day'], bins=50, kde=True)
plt.title('Distribution of Tweets Per Day')
plt.xlabel('Tweets Per Day')
plt.ylabel('Frequency')
plt.show()

# show the relationship between account age and tweets per day
plt.figure(figsize=(10, 6))
sns.scatterplot(x='account_age', y='tweets_per_day', data=df_cleaned)
plt.title('Account Age vs. Tweets Per Day')
plt.xlabel('Account Age (Days)')
plt.ylabel('Tweets Per Day')
plt.show()

# Exploring 'link_color' and 'sidebar_color' features

# Check number of NaN value in  'link_color' and 'sidebar_color' features
link_color_nan_count = df_cleaned['link_color'].isnull().sum()
sidebar_color_nan_count = df_cleaned['sidebar_color'].isnull().sum()

print()
print(f"Number of NaN values in 'link_color': {link_color_nan_count}.")
print(f"Number of NaN values in 'sidebar_color': {sidebar_color_nan_count}.")

# Check how many available colors in 'link_color' and 'sidebar_color' features
link_color_count = len(df_cleaned['link_color'].unique())
sidebar_color_count = len(df_cleaned['sidebar_color'].unique())
print(f'Number of link color is {link_color_count}.')
print(f'Number of side bar color is {sidebar_color_count}.')

# Apply the function to 'link_color' and 'sidebar_color'
df_cleaned['link_color'] = df_cleaned['link_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')
df_cleaned['sidebar_color'] = df_cleaned['sidebar_color'].apply(lambda x: f'#{x}' if len(x) == 6 else '#000000')

# Drop rows where 'sidebar_color' is still NaN
df_cleaned = df_cleaned.dropna(subset=['link_color'])
df_cleaned = df_cleaned.dropna(subset=['sidebar_color'])
print(f"Number of NaN values in 'link_color': {df_cleaned['link_color'].isnull().sum()}")
print(f"Number of NaN values in 'sidebar_color': {df_cleaned['sidebar_color'].isnull().sum()}")

# top 15 colors
top_sidebar_colors = df_cleaned['sidebar_color'].value_counts().iloc[:15].index.tolist()
top_link_colors = df_cleaned['link_color'].value_counts().iloc[:15].index.tolist()
# print(top_sidebar_colors)

# Extract top 10 most common sidebar colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='sidebar_color', data=df_cleaned, order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index, palette=top_sidebar_colors)
plt.title('Top 15 Most Common Profile sidebar_color')
plt.ylabel('Sidebar Color')
plt.xlabel('count')
plt.grid()
plt.show()

# Extract top 10 most common link colors
sns.set(rc={'axes.facecolor':'lightgrey', 'figure.facecolor':'white'})
plt.figure(figsize=(8, 6))
sns.countplot(y='link_color', data=df_cleaned, order=df_cleaned['link_color'].value_counts().iloc[:15].index, palette=top_link_colors)
plt.title('Top 15 Most Common Profile link_color')
plt.ylabel('Link Color')
plt.xlabel('count')
plt.grid()
plt.show()

# count plot for sidebar_color vs. gender
plt.figure(figsize=(10, 6))
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.countplot(x='sidebar_color', hue='gender', data=df_cleaned,
              order=df_cleaned['sidebar_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common Sidebar Colors by Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()

# count plot for link_color vs. gender
plt.figure(figsize=(10, 6))
sns.countplot(x='link_color', hue='gender', data=df_cleaned,
              order=df_cleaned['link_color'].value_counts().iloc[:15].index)
plt.title('Top 15 Most Common link Colors by Gender')
plt.xlabel('Link Color')
plt.ylabel('count')
plt.xticks(rotation=45)
plt.show()

# Scatter plot for link_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='link_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['link_color'].isin(top_link_colors)],
                palette='Set2', s=100, alpha=0.7)
plt.title('Link Colors vs. Tweet count with Gender')
plt.xlabel('Link Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()

# Scatter plot for sidebar_color vs. tweet_count with gender as hue
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sidebar_color', y='tweet_count', hue='gender', data=df_cleaned[df_cleaned['sidebar_color'].isin(top_sidebar_colors)],
                palette='Set2', s=100, alpha=0.7)
plt.title('Sidebar Colors vs. Tweet count with Gender')
plt.xlabel('Sidebar Color')
plt.ylabel('Tweet count')
plt.xticks(rotation=45)
plt.show()

# Select columns to be used
col = ['gender', 'gender:confidence', 'description', 'favorites_per_day','link_color',
       'retweets_per_day', 'sidebar_color', 'text', 'tweets_per_day','user_timezone', 'tweet_location', 'profile_created_year', 'tweet_created_year'
       ]
df_preprocessed = df_cleaned[col].copy()
# Remove rows where gender is 'Unknown'
df_preprocessed = df_preprocessed[df_preprocessed['gender'] != 'unknown']

# Plot correlation matrix
corr_matrix = df_preprocessed.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

# Drop one feature from highly correlated pairs (correlation > 0.9)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_preprocessed = df_preprocessed.drop(columns=to_drop)

# Filling missing values for important features
df_preprocessed['user_timezone'].fillna('Unknown', inplace=True)
df_preprocessed['tweet_location'].fillna('Unknown', inplace=True)
categorical_features = ['user_timezone', 'tweet_location']

# categorise types of features

# numerical features
df_num = df_preprocessed[['retweets_per_day', 'favorites_per_day', 'tweets_per_day', 'profile_created_year', 'tweet_created_year']].copy()

# categorical features with frequency encoding
freq_encoding_location = df_preprocessed['tweet_location'].value_counts(normalize=True)
df_preprocessed['tweet_location_encoded'] = df_preprocessed['tweet_location'].map(freq_encoding_location)

freq_encoding_timezone = df_preprocessed['user_timezone'].value_counts(normalize=True)
df_preprocessed['user_timezone_encoded'] = df_preprocessed['user_timezone'].map(freq_encoding_timezone)

# gender features
# encode the 'gender' column to numeric values
df_preprocessed['gender'] = df_preprocessed['gender'].replace({'male': 0, 'female': 1, 'brand': 2})

# Check for unique values in the 'gender' column after replacement
print()
print("Unique Values in 'gender'")
print(df_preprocessed['gender'].unique())
print(df_preprocessed.info())

# Distribution of gender
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_preprocessed)
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('count')
plt.show()

df_gender = df_preprocessed[['gender', 'gender:confidence']].copy()

# Drop the original categorical columns
df_preprocessed = df_preprocessed.drop(columns=categorical_features)

# Convert 'link_color' values
df_preprocessed['link_color_rgb'] = df_preprocessed['link_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))
# Convert 'sidebar_color' values
df_preprocessed['sidebar_color_rgb'] = df_preprocessed['sidebar_color'].apply(lambda x: hex_to_rgb(x) if isinstance(x, str) else (0,0,0))

rgb_df = pd.DataFrame(df_preprocessed['link_color_rgb'].to_list(), columns=['link_R', 'link_G', 'link_B'])
rgb_df = pd.concat([rgb_df, pd.DataFrame(df_preprocessed['sidebar_color_rgb'].to_list(), columns=['sidebar_R', 'sidebar_G', 'sidebar_B'])], axis=1)

# Drop the original color features
df_preprocessed = df_preprocessed.drop(columns=['link_color', 'sidebar_color', 'link_color_rgb', 'sidebar_color_rgb'])

# Check if all required features are there
print()
print('All Remaining Features')
print(df_preprocessed.columns.tolist())

# Define the numerical features to scale (filtering for int64 and float64 columns)
numerical_features = df_preprocessed.select_dtypes(include=[np.number])
# print(f'All current numerical features are {numerical_features.columns.tolist()}')

print()
print('Dataset Overview After PreProcessing')
print(df_preprocessed.info())

print()
print('---- NLP Processing ----')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

df_preprocessed['description'].fillna('', inplace=True)
df_preprocessed['text'].fillna('', inplace=True)
# df_preprocessed['name'].fillna('', inplace=True)

# Check the text features if they still contain NaN
print()
print(df_preprocessed.select_dtypes(include=[object]))

# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Apply preprocessing to the 'description', 'text', and 'name' columns
df_preprocessed['cleaned_description'] = df_preprocessed['description'].apply(lambda x: preprocess_text(str(x)))
df_preprocessed['cleaned_text'] = df_preprocessed['text'].apply(lambda x: preprocess_text(str(x)))
# df_preprocessed['cleaned_name'] = df_preprocessed['name'].apply(lambda x: preprocess_text(str(x)))

# Check the preprocessed data with preprocessed text features
print(df_preprocessed[['description', 'cleaned_description', 'text', 'cleaned_text']].head())

# Drop the original text features
df_preprocessed = df_preprocessed.drop(columns=['description','text'])

# Initialize TFIDF vectorizer for text features
print()
print('Applying TF-IDF Vectorisation...')
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')

# Apply TF-IDF on 'description', 'text', 'name' columns

tfidf_description = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_description']).toarray()
tfidf_text = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_text']).toarray()
# tfidf_name = tfidf_vectorizer.fit_transform(df_preprocessed['cleaned_name']).toarray()

# Convert TF-IDF into DataFrames and add to df_preprocessed
tfidf_desc_df = pd.DataFrame(tfidf_description, columns=[f'desc_{i}' for i in range(tfidf_description.shape[1])])
tfidf_text_df = pd.DataFrame(tfidf_text, columns=[f'text_{i}' for i in range(tfidf_text.shape[1])])
# tfidf_name_df = pd.DataFrame(tfidf_name, columns=[f'name_{i}' for i in range(tfidf_name.shape[1])])

# Merge with main dataframe
df_preprocessed = pd.concat([df_preprocessed.reset_index(drop=True), tfidf_desc_df, tfidf_text_df], axis=1)

# Drop the cleaned text features
df_preprocessed = df_preprocessed.drop(columns=['cleaned_description', 'cleaned_text'])

df_preprocessed = pd.concat([df_preprocessed, rgb_df], axis=1)

df_asso = df_preprocessed.copy()

df_cate = df_preprocessed[['tweet_location_encoded', 'user_timezone_encoded']].copy()
Dataset Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  profile_yn_gold        50 non-null     object 
 16  profileimage           20050 non-null  object 
 17  retweet_count          20050 non-null  int64  
 18  sidebar_color          20050 non-null  object 
 19  text                   20050 non-null  object 
 20  tweet_coord            159 non-null    object 
 21  tweet_count            20050 non-null  int64  
 22  tweet_created          20050 non-null  object 
 23  tweet_id               20050 non-null  float64
 24  tweet_location         12565 non-null  object 
 25  user_timezone          12252 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(17)
memory usage: 3.8+ MB
None
    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226    False   finalized                   3    10/26/15 23:24   
1  815719227    False   finalized                   3    10/26/15 23:30   
2  815719228    False   finalized                   3    10/26/15 23:33   
3  815719229    False   finalized                   3    10/26/15 23:10   
4  815719230    False   finalized                   3     10/27/15 1:15   

   gender  gender:confidence profile_yn  profile_yn:confidence  \
0    male             1.0000        yes                    1.0   
1    male             1.0000        yes                    1.0   
2    male             0.6625        yes                    1.0   
3    male             1.0000        yes                    1.0   
4  female             1.0000        yes                    1.0   

          created  ...                                       profileimage  \
0    12/5/13 1:48  ...  https://pbs.twimg.com/profile_images/414342229...   
1   10/1/12 13:51  ...  https://pbs.twimg.com/profile_images/539604221...   
2  11/28/14 11:30  ...  https://pbs.twimg.com/profile_images/657330418...   
3   6/11/09 22:39  ...  https://pbs.twimg.com/profile_images/259703936...   
4   4/16/14 13:23  ...  https://pbs.twimg.com/profile_images/564094871...   

   retweet_count sidebar_color  \
0              0        FFFFFF   
1              0        C0DEED   
2              1        C0DEED   
3              0        C0DEED   
4              0             0   

                                                text tweet_coord tweet_count  \
0  Robbie E Responds To Critics After Win Against...         NaN      110964   
1  ‰ÛÏIt felt like they were my friends and I was...         NaN        7471   
2  i absolutely adore when louis starts the songs...         NaN        5617   
3  Hi @JordanSpieth - Looking at the url - do you...         NaN        1693   
4  Watching Neighbours on Sky+ catching up with t...         NaN       31462   

    tweet_created      tweet_id   tweet_location               user_timezone  
0  10/26/15 12:40  6.587300e+17  main; @Kan1shk3                     Chennai  
1  10/26/15 12:40  6.587300e+17              NaN  Eastern Time (US & Canada)  
2  10/26/15 12:40  6.587300e+17           clcncl                    Belgrade  
3  10/26/15 12:40  6.587300e+17    Palo Alto, CA  Pacific Time (US & Canada)  
4  10/26/15 12:40  6.587300e+17              NaN                         NaN  

[5 rows x 26 columns]

Finding columns with missing data...

Column _last_judgment_at is missing 50 values.
Proportion of missing data is 0.0024937655860349127.

Column gender is missing 97 values.
Proportion of missing data is 0.00483790523690773.

Column gender:confidence is missing 26 values.
Proportion of missing data is 0.0012967581047381546.

Column description is missing 3744 values.
Proportion of missing data is 0.18673316708229426.

Column gender_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column gender_gold...

Column profile_yn_gold is missing 20000 values.
Proportion of missing data is 0.9975062344139651.
Dropping column profile_yn_gold...

Column tweet_coord is missing 19891 values.
Proportion of missing data is 0.992069825436409.
Dropping column tweet_coord...

Column tweet_location is missing 7485 values.
Proportion of missing data is 0.3733167082294264.

Column user_timezone is missing 7798 values.
Proportion of missing data is 0.388927680798005.

Dataset Overview
<class 'pandas.core.frame.DataFrame'>
Index: 19953 entries, 0 to 20049
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               19953 non-null  int64  
 1   _golden                19953 non-null  bool   
 2   _unit_state            19953 non-null  object 
 3   _trusted_judgments     19953 non-null  int64  
 4   _last_judgment_at      19903 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      19953 non-null  float64
 7   profile_yn:confidence  19953 non-null  float64
 8   created                19953 non-null  object 
 9   description            16224 non-null  object 
 10  fav_number             19953 non-null  int64  
 11  link_color             19953 non-null  object 
 12  name                   19953 non-null  object 
 13  profileimage           19953 non-null  object 
 14  retweet_count          19953 non-null  int64  
 15  sidebar_color          19953 non-null  object 
 16  text                   19953 non-null  object 
 17  tweet_count            19953 non-null  int64  
 18  tweet_created          19953 non-null  object 
 19  tweet_id               19953 non-null  float64
 20  tweet_location         12510 non-null  object 
 21  user_timezone          12185 non-null  object 
dtypes: bool(1), float64(3), int64(5), object(13)
memory usage: 3.4+ MB
None
    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  815719226    False   finalized                   3    10/26/15 23:24   
1  815719227    False   finalized                   3    10/26/15 23:30   
2  815719228    False   finalized                   3    10/26/15 23:33   
3  815719229    False   finalized                   3    10/26/15 23:10   
4  815719230    False   finalized                   3     10/27/15 1:15   

   gender  gender:confidence  profile_yn:confidence         created  \
0    male             1.0000                    1.0    12/5/13 1:48   
1    male             1.0000                    1.0   10/1/12 13:51   
2    male             0.6625                    1.0  11/28/14 11:30   
3    male             1.0000                    1.0   6/11/09 22:39   
4  female             1.0000                    1.0   4/16/14 13:23   

                                         description  ...            name  \
0                              i sing my own rhythm.  ...         sheezy0   
1  I'm the author of novels filled with family dr...  ...     DavdBurnett   
2                louis whining and squealing and all  ...  lwtprettylaugh   
3  Mobile guy.  49ers, Shazam, Google, Kleiner Pe...  ...     douggarland   
4  Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...  ...    WilfordGemma   

                                        profileimage retweet_count  \
0  https://pbs.twimg.com/profile_images/414342229...             0   
1  https://pbs.twimg.com/profile_images/539604221...             0   
2  https://pbs.twimg.com/profile_images/657330418...             1   
3  https://pbs.twimg.com/profile_images/259703936...             0   
4  https://pbs.twimg.com/profile_images/564094871...             0   

  sidebar_color                                               text  \
0        FFFFFF  Robbie E Responds To Critics After Win Against...   
1        C0DEED  ‰ÛÏIt felt like they were my friends and I was...   
2        C0DEED  i absolutely adore when louis starts the songs...   
3        C0DEED  Hi @JordanSpieth - Looking at the url - do you...   
4             0  Watching Neighbours on Sky+ catching up with t...   

  tweet_count   tweet_created      tweet_id   tweet_location  \
0      110964  10/26/15 12:40  6.587300e+17  main; @Kan1shk3   
1        7471  10/26/15 12:40  6.587300e+17              NaN   
2        5617  10/26/15 12:40  6.587300e+17           clcncl   
3        1693  10/26/15 12:40  6.587300e+17    Palo Alto, CA   
4       31462  10/26/15 12:40  6.587300e+17              NaN   

                user_timezone  
0                     Chennai  
1  Eastern Time (US & Canada)  
2                    Belgrade  
3  Pacific Time (US & Canada)  
4                         NaN  

[5 rows x 22 columns]

---- EXPLORATORY DATA ANALYSIS (EDA) ----
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Number of NaN values in 'link_color': 0.
Number of NaN values in 'sidebar_color': 0.
Number of link color is 2986.
Number of side bar color is 559.
Number of NaN values in 'link_color': 0
Number of NaN values in 'sidebar_color': 0
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Unique Values in 'gender'
[0 1 2]
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  18836 non-null  int64  
 1   gender:confidence       18836 non-null  float64
 2   description             15522 non-null  object 
 3   favorites_per_day       18836 non-null  float64
 4   link_color              18836 non-null  object 
 5   retweets_per_day        18836 non-null  float64
 6   sidebar_color           18836 non-null  object 
 7   text                    18836 non-null  object 
 8   tweets_per_day          18836 non-null  float64
 9   user_timezone           18836 non-null  object 
 10  tweet_location          18836 non-null  object 
 11  profile_created_year    18836 non-null  int32  
 12  tweet_created_year      18836 non-null  int32  
 13  tweet_location_encoded  18836 non-null  float64
 14  user_timezone_encoded   18836 non-null  float64
dtypes: float64(6), int32(2), int64(1), object(6)
memory usage: 2.2+ MB
None
No description has been provided for this image
All Remaining Features
['gender', 'gender:confidence', 'description', 'favorites_per_day', 'retweets_per_day', 'text', 'tweets_per_day', 'profile_created_year', 'tweet_created_year', 'tweet_location_encoded', 'user_timezone_encoded']

Dataset Overview After PreProcessing
<class 'pandas.core.frame.DataFrame'>
Index: 18836 entries, 0 to 20049
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  18836 non-null  int64  
 1   gender:confidence       18836 non-null  float64
 2   description             15522 non-null  object 
 3   favorites_per_day       18836 non-null  float64
 4   retweets_per_day        18836 non-null  float64
 5   text                    18836 non-null  object 
 6   tweets_per_day          18836 non-null  float64
 7   profile_created_year    18836 non-null  int32  
 8   tweet_created_year      18836 non-null  int32  
 9   tweet_location_encoded  18836 non-null  float64
 10  user_timezone_encoded   18836 non-null  float64
dtypes: float64(6), int32(2), int64(1), object(2)
memory usage: 1.6+ MB
None

---- NLP Processing ----

                                             description  \
0                                  i sing my own rhythm.   
1      I'm the author of novels filled with family dr...   
2                    louis whining and squealing and all   
3      Mobile guy.  49ers, Shazam, Google, Kleiner Pe...   
4      Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...   
...                                                  ...   
20045                                               (rp)   
20046  Whatever you like, it's not a problem at all. ...   
20047  #TeamBarcelona ..You look lost so you should f...   
20048  Anti-statist; I homeschool my kids. Aspiring t...   
20049                     Teamwork makes the dream work.   

                                                    text  
0      Robbie E Responds To Critics After Win Against...  
1      ‰ÛÏIt felt like they were my friends and I was...  
2      i absolutely adore when louis starts the songs...  
3      Hi @JordanSpieth - Looking at the url - do you...  
4      Watching Neighbours on Sky+ catching up with t...  
...                                                  ...  
20045  @lookupondeath ...Fine, and I'll drink tea too...  
20046  Greg Hardy you a good player and all but don't...  
20047  You can miss people and still never want to se...  
20048  @bitemyapp i had noticed your tendency to pee ...  
20049  I think for my APUSH creative project I'm goin...  

[18836 rows x 2 columns]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
                                         description  \
0                              i sing my own rhythm.   
1  I'm the author of novels filled with family dr...   
2                louis whining and squealing and all   
3  Mobile guy.  49ers, Shazam, Google, Kleiner Pe...   
4  Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...   

                                 cleaned_description  \
0                                        sing rhythm   
1        im author novel filled family drama romance   
2                            louis whining squealing   
3  mobile guy er shazam google kleiner perkins ya...   
4  ricky wilson best frontmankaiser chief best ba...   

                                                text  \
0  Robbie E Responds To Critics After Win Against...   
1  ‰ÛÏIt felt like they were my friends and I was...   
2  i absolutely adore when louis starts the songs...   
3  Hi @JordanSpieth - Looking at the url - do you...   
4  Watching Neighbours on Sky+ catching up with t...   

                                        cleaned_text  
0  robbie e responds critic win eddie edward worl...  
1  felt like friend living story httpstcoarngeyhn...  
2  absolutely adore louis start song hit hard fee...  
3  hi jordanspieth looking url use ifttt dont typ...  
4    watching neighbour sky catching neighbs xxx xxx  

Applying TF-IDF Vectorisation...

CLUSTERING¶

In [2]:
print()
print()
print('---- CLUSTERING MODELS ----')

print()
print("=" * 50)
print('EXP 1: USING ALL SELECTED FEATURES')
print("=" * 50)

sil_ex1 = []
cal_ex1 = []
# Drop the gender and categorical features before normalise

df_cat = df_cate.copy()
# Drop gender feature and categorical features
df_preprocessed = df_preprocessed.drop(columns=df_cat.columns)
df_finalised = df_preprocessed.drop(columns=['gender', 'gender:confidence'])

# Normalise every existing feature
scaler = StandardScaler()
df_finalised = pd.DataFrame(scaler.fit_transform(df_finalised), columns=df_finalised.columns)

df_finalised = pd.concat([df_finalised, df_cat, df_gender], axis=1)
# find the rows that contained NaN values and drop them
df_finalised = df_finalised.dropna()

data_exp1 = df_finalised
df_ex1 = df_finalised.drop(columns=['gender', 'gender:confidence'])


# Check the preprocessed dataset in the present
print()
print('Dataset for Exp 1')
print(df_ex1.info())
print()

# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_vis = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(df_ex1)
umap_plot = umap_vis.fit_transform(df_ex1)
print(umap_embedding.shape)

# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding)
kmeans_clustering.tune_hyperparameters()
kmeans_exp1 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_plot, 'All feature types')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex1.append(kmeans_clustering.silhoutte())
cal_ex1.append(kmeans_clustering.calinski())

k_retriever = ClusteringDataRetriever(data_exp1, k_labels)
df_with_labels = k_retriever.get_data_with_labels()

print()
print('Dataset with Labels from KMeans in Exp 1')
print(df_with_labels.head())
for label in np.unique(k_labels):
    print()
    print(f'Records found in cluster {label} from KMeans in Exp 1')
    print(k_retriever.get_cluster_data(label))
    print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')

# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding)
dbscan_clustering.tune_hyperparameters()
dbscan_exp1 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_plot, 'All feature types')
db_labels = dbscan_clustering.output_label()
sil_ex1.append(dbscan_clustering.silhoutte())
cal_ex1.append(dbscan_clustering.calinski())

# Initialize the class to retrieve data
db_retriever = ClusteringDataRetriever(data_exp1, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 1')
print(df_with_labels.head())
for label in np.unique(db_labels):
    if label != -1:
        print()
        print(f'Records found in cluster {label} from DBSCAN in Exp 1')
        print(db_retriever.get_cluster_data(label))
        print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())

print()
print("=" * 50)
print('EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES')
print("=" * 50)

sil_ex2 = []
cal_ex2 = []

# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_num.shape[0], chunk_size):
    df_num.iloc[i:i + chunk_size] = scaler.fit_transform(df_num.iloc[i:i + chunk_size])
df_no_text = pd.concat([df_num, df_cate, df_gender], axis=1)
print()
print("Data with Only Numerical and Categorical Features")
print(df_no_text.info())
print()

df_no_text = df_no_text.dropna()
df_no_text_wg = df_no_text.copy()
print('Removing NaN values...')

# Drop gender feature before clustering
data_exp2 = df_no_text.drop(columns=['gender', 'gender:confidence'])
print('Dropping gender and gender:confidence...')

# Check No. of records after drop NaN values
print()
print("Dataset for Exp 2")
print(data_exp2.info())
print()
print(data_exp2.head())

# Apply UMAP for dimensionality reduction
print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42)
umap_embedding = umap_model.fit_transform(data_exp2)
print(umap_embedding.shape)
# umap_embedding = umap_embedding.astype(np.float32)

# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(data_exp2)
kmeans_clustering.tune_hyperparameters()
kmeans_exp2 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Numerical and categorical features')  # Visualize clusters
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex2.append(kmeans_clustering.silhoutte())
cal_ex2.append(kmeans_clustering.calinski())

k_retriever = ClusteringDataRetriever(df_no_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 2')
print(df_with_labels.head())
for label in np.unique(k_labels):
    print()
    print(f'Records found in cluster {label} from KMeans in Exp 2')
    print(k_retriever.get_cluster_data(label))
    print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')

# DBSCAN Clustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(data_exp2)
dbscan_clustering.tune_hyperparameters()  # Tune DBSCAN hyperparameters
dbscan_exp2 = dbscan_clustering.fit_model()  # Fit the DBSCAN model
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'numerical and categorical features')  # Plot 3D noise points and valid clusters
db_labels = dbscan_clustering.output_label()
sil_ex2.append(dbscan_clustering.silhoutte())
cal_ex2.append(dbscan_clustering.calinski())


db_retriever = ClusteringDataRetriever(df_no_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 2')
print(df_with_labels.head())
for label in np.unique(db_labels):
    if label != -1:
        print()
        print(f'Records found in cluster {label} from DBSCAN in Exp 2')
        print(db_retriever.get_cluster_data(label))
        print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())

print()
print("=" * 50)
print('EXP 3: USING ONLY TEXT FEATURES')
print("=" * 50)

sil_ex3 = []
cal_ex3 = []
# Merge with main dataframe
df_with_text = pd.concat([tfidf_desc_df, tfidf_text_df], axis=1)
# Normalise every existing feature
scaler = StandardScaler()
chunk_size = 100
for i in range(0, df_with_text.shape[0], chunk_size):
    df_with_text.iloc[i:i + chunk_size] = scaler.fit_transform(df_with_text.iloc[i:i + chunk_size])

df_with_text_wg = pd.concat([df_with_text, df_gender], axis=1)
# Drop NaN values before clustering
df_with_text_wg = df_with_text_wg.dropna()
data_exp3 = df_with_text_wg.drop(columns=['gender', 'gender:confidence'])

# Drop the gender features before clustering

print('Dataset for Exp 3')
print(data_exp3.info())
print()
print(data_exp3.head())

print('Applying UMAP for dim reduction...')
umap_model = umap.UMAP()
umap_embedding_t = umap_model.fit_transform(data_exp3)
umap_embedding = umap.UMAP(n_neighbors=30,min_dist=0.1, n_components=3, random_state=42).fit_transform(data_exp3)

# K-Means Clustering
print()
print('Performing K-Means Clustering...')
kmeans_clustering = KMeansClustering(umap_embedding_t)
kmeans_clustering.tune_hyperparameters()
kmeans_exp3 = kmeans_clustering.fit_model()
kmeans_clustering.visualize_clusters(umap_embedding, 'Text features')
kmeans_clustering.plot_elbow_method()
k_labels = kmeans_clustering.output_label()
sil_ex3.append(kmeans_clustering.silhoutte())
cal_ex3.append(kmeans_clustering.calinski())

k_retriever = ClusteringDataRetriever(df_with_text_wg, k_labels)
df_with_labels = k_retriever.get_data_with_labels()
print()
print('Dataset with Labels from KMeans in Exp 3')
print(df_with_labels.head())
for label in np.unique(k_labels):
    print()
    print(f'Records found in cluster {label} from KMeans in Exp 3')
    print(k_retriever.get_cluster_data(label))
    print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
    print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')

# DBSCANClustering
print()
print('Performing DBSCAN Clustering...')
dbscan_clustering = DBSCANClustering(umap_embedding_t)
dbscan_clustering.tune_hyperparameters()
dbscan_exp3 = dbscan_clustering.fit_model()
dbscan_clustering.visualize_clusters_and_outliers_3D(umap_embedding, 'Text features')
db_labels = dbscan_clustering.output_label()
sil_ex3.append(dbscan_clustering.silhoutte())
cal_ex3.append(dbscan_clustering.calinski())

db_retriever = ClusteringDataRetriever(df_with_text_wg, db_labels)
df_with_labels = db_retriever.get_data_with_labels()
print()
print('Dataset with Labels from DBSCAN in Exp 3')
print(df_with_labels.head())
for label in np.unique(db_labels):
    if label != -1:
        print()
        print(f'Records found in cluster {label} from DBSCAN in Exp 3')
        print(db_retriever.get_cluster_data(label))
        print(f'No. of records with gender 0 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 0) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 1 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 1) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
        print(f'No. of records with gender 2 in cluster {label} is {df_with_labels[(df_with_labels["gender"] == 2) & (df_with_labels["Cluster_Label"] == label)].shape[0]}')
print('Records classified as noise')
print(db_retriever.get_noise_data())

print()
print('---- VISUALIZE THE METRIC EVALUATION ----')

# Metric functions
model_names = ['KMeans', 'DBSCAN']

sil_scores = [sil_ex1, sil_ex2, sil_ex3]
cal_scores = [cal_ex1, cal_ex2, cal_ex3]

plot_silhouette_bar_across_experiments(model_names, sil_scores)
visualize_ch_index_across_experiments(model_names, cal_scores)

---- CLUSTERING MODELS ----

==================================================
EXP 1: USING ALL SELECTED FEATURES
==================================================

Dataset for Exp 1
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Columns: 3013 entries, favorites_per_day to user_timezone_encoded
dtypes: float64(3013)
memory usage: 407.1 MB
None

Applying UMAP for dim reduction...
[I 2024-09-19 17:45:49,716] A new study created in memory with name: no-name-33c6aaa9-9d16-4038-a5a4-495c225843be
(17702, 2)

Performing K-Means Clustering...
[I 2024-09-19 17:45:54,957] Trial 0 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:45:59,799] Trial 1 finished with value: 0.4471836984157562 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:04,381] Trial 2 finished with value: 0.3998541533946991 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:09,241] Trial 3 finished with value: 0.4395274221897125 and parameters: {'n_clusters': 3, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:13,853] Trial 4 finished with value: 0.41091856360435486 and parameters: {'n_clusters': 8, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:18,491] Trial 5 finished with value: 0.4032902717590332 and parameters: {'n_clusters': 10, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:23,372] Trial 6 finished with value: 0.43896961212158203 and parameters: {'n_clusters': 3, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:28,029] Trial 7 finished with value: 0.39738479256629944 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:32,695] Trial 8 finished with value: 0.42352718114852905 and parameters: {'n_clusters': 7, 'init': 'random'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:37,844] Trial 9 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:42,705] Trial 10 finished with value: 0.4405798316001892 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:47,858] Trial 11 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:53,026] Trial 12 finished with value: 0.8162233233451843 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:46:57,961] Trial 13 finished with value: 0.4405798316001892 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
[I 2024-09-19 17:47:02,826] Trial 14 finished with value: 0.448025643825531 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 0 with value: 0.8162233233451843.
Best params: {'n_clusters': 2, 'init': 'k-means++'}
No description has been provided for this image
No description has been provided for this image
The Silhouette score is 0.8162233233451843
The Callinski index is 53467.58203125

Dataset with Labels from KMeans in Exp 1
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from KMeans in Exp 1
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[15998 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5318
No. of records with gender 1 in cluster 0 is 5676
No. of records with gender 2 in cluster 0 is 5004

Records found in cluster 1 from KMeans in Exp 1
[I 2024-09-19 17:47:09,518] A new study created in memory with name: no-name-de4af118-ab6a-4e35-b187-5867231e4373
       gender  gender:confidence  Cluster_Label
7         0.0             1.0000              1
33        0.0             1.0000              1
49        2.0             1.0000              1
56        1.0             0.6684              1
58        0.0             1.0000              1
...       ...                ...            ...
18731     1.0             1.0000              1
18738     2.0             1.0000              1
18753     0.0             0.6678              1
18789     0.0             1.0000              1
18803     1.0             1.0000              1

[1704 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 525
No. of records with gender 1 in cluster 1 is 525
No. of records with gender 2 in cluster 1 is 654

Performing DBSCAN Clustering...
[I 2024-09-19 17:47:15,062] Trial 0 finished with value: 0.44018059968948364 and parameters: {'eps': 0.6825546761974374, 'min_samples': 12}. Best is trial 0 with value: 0.44018059968948364.
[I 2024-09-19 17:47:20,930] Trial 1 finished with value: 0.549675703048706 and parameters: {'eps': 1.0905561932063113, 'min_samples': 5}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:26,373] Trial 2 finished with value: 0.4102407693862915 and parameters: {'eps': 0.4933035459866175, 'min_samples': 17}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:32,394] Trial 3 finished with value: 0.5234434604644775 and parameters: {'eps': 1.3809801879490313, 'min_samples': 11}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:38,584] Trial 4 finished with value: 0.5330817103385925 and parameters: {'eps': 1.7208744440250079, 'min_samples': 5}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:43,984] Trial 5 finished with value: 0.03209728002548218 and parameters: {'eps': 0.41497364231347533, 'min_samples': 11}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:49,391] Trial 6 finished with value: 0.0401240736246109 and parameters: {'eps': 0.4164881924972801, 'min_samples': 9}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:47:54,869] Trial 7 finished with value: 0.463115930557251 and parameters: {'eps': 0.5467979615960828, 'min_samples': 12}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:48:00,018] Trial 8 finished with value: -0.3428648114204407 and parameters: {'eps': 0.14310863827181103, 'min_samples': 14}. Best is trial 1 with value: 0.549675703048706.
[I 2024-09-19 17:48:06,007] Trial 9 finished with value: 0.5529610514640808 and parameters: {'eps': 1.2952162564617473, 'min_samples': 20}. Best is trial 9 with value: 0.5529610514640808.
[I 2024-09-19 17:48:12,367] Trial 10 finished with value: 0.5906190872192383 and parameters: {'eps': 1.9669814522582851, 'min_samples': 19}. Best is trial 10 with value: 0.5906190872192383.
[I 2024-09-19 17:48:18,750] Trial 11 finished with value: 0.5906190872192383 and parameters: {'eps': 1.9611053257653026, 'min_samples': 20}. Best is trial 10 with value: 0.5906190872192383.
[I 2024-09-19 17:48:25,097] Trial 12 finished with value: 0.5911170840263367 and parameters: {'eps': 1.9300803447293904, 'min_samples': 20}. Best is trial 12 with value: 0.5911170840263367.
[I 2024-09-19 17:48:31,496] Trial 13 finished with value: 0.5892439484596252 and parameters: {'eps': 1.9952355283145653, 'min_samples': 17}. Best is trial 12 with value: 0.5911170840263367.
[I 2024-09-19 17:48:37,700] Trial 14 finished with value: 0.5957051515579224 and parameters: {'eps': 1.68208869635505, 'min_samples': 17}. Best is trial 14 with value: 0.5957051515579224.
Found best params: {'eps': 1.68208869635505, 'min_samples': 17}
No description has been provided for this image
The Silhouette score is 0.5957051515579224
The Callinski index is 3768.14453125

Dataset with Labels from DBSCAN in Exp 1
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[15909 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5293
No. of records with gender 1 in cluster 0 is 5650
No. of records with gender 2 in cluster 0 is 4966

Records found in cluster 1 from DBSCAN in Exp 1
     gender  gender:confidence  Cluster_Label
7       0.0             1.0000              1
33      0.0             1.0000              1
49      2.0             1.0000              1
56      1.0             0.6684              1
58      0.0             1.0000              1
132     1.0             1.0000              1
153     2.0             1.0000              1
191     2.0             0.6804              1
192     0.0             1.0000              1
199     1.0             1.0000              1
231     1.0             1.0000              1
243     0.0             1.0000              1
250     2.0             1.0000              1
288     1.0             0.6494              1
308     1.0             0.6752              1
390     1.0             0.6786              1
460     2.0             0.6708              1
503     0.0             1.0000              1
No. of records with gender 0 in cluster 1 is 6
No. of records with gender 1 in cluster 1 is 7
No. of records with gender 2 in cluster 1 is 5

Records found in cluster 2 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
282       1.0             1.0000              2
302       1.0             1.0000              2
1402      0.0             0.3539              2
1544      0.0             1.0000              2
2154      1.0             0.6561              2
2347      2.0             0.6757              2
2929      0.0             1.0000              2
2964      1.0             1.0000              2
3229      0.0             1.0000              2
3341      1.0             1.0000              2
3770      0.0             1.0000              2
3938      2.0             0.6545              2
4650      2.0             0.3571              2
5206      1.0             1.0000              2
5367      0.0             1.0000              2
5424      0.0             1.0000              2
5629      2.0             1.0000              2
5634      2.0             0.6840              2
5640      0.0             1.0000              2
5944      1.0             1.0000              2
6093      1.0             0.6653              2
6157      2.0             0.6567              2
6174      2.0             0.6619              2
6313      1.0             1.0000              2
6409      0.0             1.0000              2
6514      1.0             1.0000              2
7625      0.0             1.0000              2
8798      1.0             1.0000              2
13356     1.0             1.0000              2
No. of records with gender 0 in cluster 2 is 10
No. of records with gender 1 in cluster 2 is 12
No. of records with gender 2 in cluster 2 is 7

Records found in cluster 3 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
426       2.0             1.0000              3
431       0.0             0.6631              3
432       0.0             1.0000              3
1992      0.0             1.0000              3
2776      0.0             1.0000              3
3755      2.0             1.0000              3
3769      2.0             0.6497              3
3784      2.0             1.0000              3
4374      2.0             1.0000              3
4418      1.0             1.0000              3
4456      1.0             1.0000              3
4653      2.0             1.0000              3
4995      2.0             1.0000              3
5008      2.0             1.0000              3
5044      2.0             1.0000              3
5196      1.0             1.0000              3
5220      2.0             0.6650              3
5352      1.0             1.0000              3
5372      2.0             1.0000              3
5533      2.0             1.0000              3
5580      0.0             1.0000              3
5596      2.0             1.0000              3
5627      2.0             0.6559              3
5662      1.0             1.0000              3
5749      2.0             1.0000              3
5919      2.0             1.0000              3
5988      2.0             1.0000              3
6208      1.0             0.6543              3
6496      2.0             0.6716              3
6669      0.0             1.0000              3
7060      1.0             0.6890              3
7261      0.0             1.0000              3
7439      0.0             1.0000              3
7683      1.0             0.6699              3
7702      2.0             0.7012              3
7771      2.0             1.0000              3
7894      0.0             1.0000              3
7898      2.0             1.0000              3
7902      0.0             1.0000              3
8120      1.0             1.0000              3
8248      1.0             1.0000              3
8295      2.0             0.6579              3
8360      2.0             0.6854              3
8408      0.0             1.0000              3
8933      1.0             1.0000              3
8984      2.0             0.6890              3
9100      0.0             1.0000              3
9341      2.0             1.0000              3
9379      0.0             1.0000              3
10138     1.0             1.0000              3
10451     0.0             0.6824              3
13349     0.0             1.0000              3
14425     0.0             0.6628              3
14668     2.0             1.0000              3
16449     1.0             1.0000              3
16881     1.0             0.6733              3
No. of records with gender 0 in cluster 3 is 16
No. of records with gender 1 in cluster 3 is 14
No. of records with gender 2 in cluster 3 is 26

Records found in cluster 4 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
502      0.0             1.0000              4
578      1.0             1.0000              4
644      0.0             1.0000              4
771      0.0             1.0000              4
963      2.0             1.0000              4
...      ...                ...            ...
9150     1.0             1.0000              4
9165     0.0             1.0000              4
9216     2.0             0.6519              4
9221     2.0             1.0000              4
9243     0.0             0.3506              4

[175 rows x 3 columns]
No. of records with gender 0 in cluster 4 is 52
No. of records with gender 1 in cluster 4 is 52
No. of records with gender 2 in cluster 4 is 71

Records found in cluster 5 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
513       2.0             1.0000              5
514       0.0             1.0000              5
520       0.0             0.3458              5
553       0.0             1.0000              5
554       0.0             0.3431              5
555       0.0             1.0000              5
556       0.0             1.0000              5
557       0.0             1.0000              5
560       1.0             1.0000              5
564       1.0             1.0000              5
565       1.0             1.0000              5
566       2.0             0.6829              5
576       0.0             1.0000              5
577       2.0             1.0000              5
1102      1.0             0.6777              5
2660      0.0             0.3478              5
7995      2.0             1.0000              5
8037      0.0             0.6374              5
8233      0.0             1.0000              5
10824     0.0             1.0000              5
No. of records with gender 0 in cluster 5 is 12
No. of records with gender 1 in cluster 5 is 4
No. of records with gender 2 in cluster 5 is 4

Records found in cluster 6 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
570       2.0             0.6616              6
2860      0.0             1.0000              6
2862      0.0             1.0000              6
2863      0.0             0.3370              6
2866      2.0             0.6497              6
2870      2.0             0.6368              6
2872      0.0             0.6855              6
2873      1.0             0.6940              6
2996      1.0             1.0000              6
3168      1.0             1.0000              6
4767      1.0             0.6774              6
5853      2.0             0.6619              6
8255      2.0             0.6672              6
9773      0.0             0.6607              6
10211     1.0             1.0000              6
10698     1.0             0.6795              6
11317     2.0             1.0000              6
11909     1.0             1.0000              6
12736     1.0             0.6619              6
14216     1.0             1.0000              6
14307     2.0             0.6617              6
14448     0.0             1.0000              6
14613     0.0             1.0000              6
14791     1.0             1.0000              6
15015     1.0             1.0000              6
15216     0.0             1.0000              6
15333     1.0             1.0000              6
15424     0.0             0.6608              6
15800     1.0             1.0000              6
16873     1.0             1.0000              6
17596     1.0             1.0000              6
18337     1.0             1.0000              6
No. of records with gender 0 in cluster 6 is 9
No. of records with gender 1 in cluster 6 is 16
No. of records with gender 2 in cluster 6 is 7

Records found in cluster 7 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
575      0.0             1.0000              7
1308     0.0             0.6479              7
2033     1.0             1.0000              7
2308     1.0             0.6774              7
3898     0.0             1.0000              7
5454     2.0             0.6774              7
5539     1.0             1.0000              7
5628     2.0             1.0000              7
5825     1.0             1.0000              7
5847     2.0             0.6717              7
6012     0.0             1.0000              7
6048     2.0             0.6796              7
6108     0.0             1.0000              7
6114     1.0             0.6620              7
6335     2.0             1.0000              7
6382     2.0             0.6842              7
6417     2.0             1.0000              7
7843     2.0             1.0000              7
8181     0.0             1.0000              7
8355     2.0             0.6778              7
8738     0.0             1.0000              7
No. of records with gender 0 in cluster 7 is 7
No. of records with gender 1 in cluster 7 is 5
No. of records with gender 2 in cluster 7 is 9

Records found in cluster 8 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
599       1.0             1.0000              8
1268      2.0             1.0000              8
2682      1.0             0.6473              8
3360      1.0             1.0000              8
5548      2.0             1.0000              8
6616      1.0             1.0000              8
7610      2.0             0.6578              8
8509      2.0             0.6731              8
9305      2.0             0.6606              8
9515      0.0             0.6648              8
10396     1.0             1.0000              8
10608     1.0             1.0000              8
10796     0.0             0.6912              8
10981     0.0             1.0000              8
11477     2.0             1.0000              8
11770     2.0             1.0000              8
12451     2.0             1.0000              8
12803     1.0             0.6667              8
12996     1.0             1.0000              8
13263     2.0             0.6743              8
13436     0.0             1.0000              8
14141     0.0             1.0000              8
14290     0.0             1.0000              8
14473     0.0             1.0000              8
14878     2.0             0.6502              8
15088     0.0             0.6581              8
15727     2.0             1.0000              8
16605     0.0             0.6578              8
16973     0.0             1.0000              8
17197     1.0             1.0000              8
17330     0.0             1.0000              8
17728     1.0             0.6702              8
18071     2.0             1.0000              8
18531     2.0             1.0000              8
No. of records with gender 0 in cluster 8 is 11
No. of records with gender 1 in cluster 8 is 10
No. of records with gender 2 in cluster 8 is 13

Records found in cluster 9 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
725       0.0             1.0000              9
1203      1.0             1.0000              9
1240      1.0             0.6889              9
2115      0.0             1.0000              9
2381      0.0             1.0000              9
3988      2.0             1.0000              9
5994      2.0             0.6611              9
7988      1.0             0.6734              9
8071      1.0             1.0000              9
10735     0.0             1.0000              9
10738     0.0             1.0000              9
11076     2.0             1.0000              9
11179     2.0             1.0000              9
11484     1.0             1.0000              9
11648     1.0             1.0000              9
11746     0.0             1.0000              9
12054     1.0             1.0000              9
13078     0.0             1.0000              9
14056     2.0             1.0000              9
15064     0.0             0.6534              9
15751     1.0             1.0000              9
15757     1.0             1.0000              9
16465     0.0             1.0000              9
16868     1.0             1.0000              9
17448     0.0             1.0000              9
18208     0.0             1.0000              9
18753     0.0             0.6678              9
No. of records with gender 0 in cluster 9 is 12
No. of records with gender 1 in cluster 9 is 10
No. of records with gender 2 in cluster 9 is 5

Records found in cluster 10 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
822       0.0             0.6473             10
1536      2.0             0.6591             10
2971      1.0             1.0000             10
10714     0.0             1.0000             10
11119     1.0             1.0000             10
11627     2.0             0.6796             10
11727     2.0             1.0000             10
12324     1.0             1.0000             10
12333     1.0             1.0000             10
12992     0.0             1.0000             10
13486     2.0             1.0000             10
13980     0.0             1.0000             10
14046     0.0             1.0000             10
14170     1.0             1.0000             10
14958     2.0             1.0000             10
15223     0.0             1.0000             10
15597     1.0             0.3362             10
15889     2.0             0.3383             10
16706     0.0             1.0000             10
16735     0.0             0.6563             10
17090     0.0             1.0000             10
17186     1.0             1.0000             10
17599     0.0             0.6654             10
18270     0.0             1.0000             10
No. of records with gender 0 in cluster 10 is 11
No. of records with gender 1 in cluster 10 is 7
No. of records with gender 2 in cluster 10 is 6

Records found in cluster 11 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
1040     1.0             1.0000             11
1045     2.0             0.6789             11
1049     1.0             1.0000             11
1051     2.0             1.0000             11
1052     1.0             1.0000             11
1054     1.0             1.0000             11
1061     0.0             1.0000             11
1064     1.0             0.6498             11
1065     0.0             1.0000             11
3581     0.0             1.0000             11
3705     2.0             0.6581             11
3809     2.0             1.0000             11
3906     1.0             0.6422             11
4041     0.0             1.0000             11
4156     1.0             1.0000             11
4272     2.0             1.0000             11
4341     0.0             1.0000             11
4410     2.0             1.0000             11
4508     1.0             1.0000             11
4631     2.0             1.0000             11
4736     2.0             1.0000             11
4840     2.0             1.0000             11
5305     1.0             1.0000             11
No. of records with gender 0 in cluster 11 is 5
No. of records with gender 1 in cluster 11 is 9
No. of records with gender 2 in cluster 11 is 9

Records found in cluster 12 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
1108      1.0             0.6880             12
9382      2.0             1.0000             12
9398      1.0             1.0000             12
9475      0.0             1.0000             12
9496      0.0             1.0000             12
...       ...                ...            ...
15207     1.0             1.0000             12
15391     2.0             1.0000             12
15439     2.0             1.0000             12
15622     2.0             1.0000             12
18398     0.0             0.6709             12

[70 rows x 3 columns]
No. of records with gender 0 in cluster 12 is 19
No. of records with gender 1 in cluster 12 is 25
No. of records with gender 2 in cluster 12 is 26

Records found in cluster 13 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
1273      0.0             1.0000             13
1605      2.0             1.0000             13
1761      2.0             1.0000             13
1845      1.0             1.0000             13
1987      1.0             1.0000             13
2274      0.0             1.0000             13
3723      1.0             1.0000             13
3961      0.0             1.0000             13
4092      0.0             0.3411             13
4424      2.0             1.0000             13
4898      0.0             1.0000             13
5218      2.0             1.0000             13
5276      2.0             0.6632             13
5336      1.0             1.0000             13
5379      0.0             1.0000             13
5445      0.0             1.0000             13
5536      2.0             0.6943             13
5927      2.0             0.6721             13
5949      1.0             0.6848             13
5980      0.0             1.0000             13
6017      1.0             0.3486             13
6245      2.0             1.0000             13
6262      2.0             1.0000             13
6289      1.0             1.0000             13
6298      0.0             1.0000             13
6466      2.0             1.0000             13
7003      1.0             1.0000             13
7118      2.0             1.0000             13
7431      1.0             1.0000             13
7540      0.0             0.6859             13
7791      1.0             1.0000             13
8142      2.0             1.0000             13
8601      2.0             0.6700             13
8693      0.0             1.0000             13
9023      1.0             0.6654             13
9265      1.0             1.0000             13
15378     1.0             1.0000             13
No. of records with gender 0 in cluster 13 is 11
No. of records with gender 1 in cluster 13 is 13
No. of records with gender 2 in cluster 13 is 13

Records found in cluster 14 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
2138     1.0             1.0000             14
2145     0.0             1.0000             14
2146     1.0             1.0000             14
2147     1.0             1.0000             14
2148     1.0             0.3576             14
2156     0.0             1.0000             14
2166     1.0             1.0000             14
2168     0.0             0.6825             14
2169     1.0             1.0000             14
2171     1.0             1.0000             14
2172     0.0             1.0000             14
2182     2.0             1.0000             14
2185     0.0             1.0000             14
2186     0.0             0.3403             14
2187     1.0             1.0000             14
2188     2.0             0.6812             14
2189     0.0             0.6582             14
2191     0.0             1.0000             14
2194     1.0             1.0000             14
2196     1.0             1.0000             14
2204     1.0             0.6587             14
2205     0.0             0.6685             14
2206     1.0             0.6551             14
2207     1.0             1.0000             14
2210     1.0             1.0000             14
2216     1.0             0.6896             14
2217     1.0             0.6832             14
2220     1.0             1.0000             14
2223     2.0             1.0000             14
No. of records with gender 0 in cluster 14 is 9
No. of records with gender 1 in cluster 14 is 17
No. of records with gender 2 in cluster 14 is 3

Records found in cluster 15 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
2445      1.0             1.0000             15
4210      2.0             1.0000             15
11871     1.0             1.0000             15
14380     0.0             0.3398             15
14935     2.0             0.6634             15
14972     1.0             0.6475             15
15079     2.0             1.0000             15
15173     0.0             1.0000             15
15186     0.0             1.0000             15
15228     1.0             1.0000             15
15231     0.0             1.0000             15
15234     0.0             1.0000             15
15236     1.0             1.0000             15
15278     2.0             1.0000             15
15287     1.0             0.6880             15
15288     2.0             1.0000             15
15292     2.0             1.0000             15
15295     2.0             1.0000             15
15313     2.0             1.0000             15
15316     2.0             1.0000             15
15322     0.0             1.0000             15
15324     2.0             0.6344             15
15338     1.0             0.6791             15
No. of records with gender 0 in cluster 15 is 6
No. of records with gender 1 in cluster 15 is 7
No. of records with gender 2 in cluster 15 is 10

Records found in cluster 16 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
2475      0.0                1.0             16
4315      1.0                1.0             16
5147      1.0                1.0             16
9579      2.0                1.0             16
17729     0.0                1.0             16
...       ...                ...            ...
18371     2.0                1.0             16
18372     2.0                1.0             16
18373     0.0                1.0             16
18374     1.0                1.0             16
18375     0.0                1.0             16

[156 rows x 3 columns]
No. of records with gender 0 in cluster 16 is 67
No. of records with gender 1 in cluster 16 is 60
No. of records with gender 2 in cluster 16 is 29

Records found in cluster 17 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
3385     1.0             1.0000             17
3386     1.0             0.6628             17
3388     2.0             1.0000             17
3391     0.0             0.6612             17
3393     1.0             1.0000             17
3394     1.0             1.0000             17
3396     1.0             1.0000             17
3397     0.0             1.0000             17
3398     2.0             1.0000             17
3400     1.0             0.6727             17
3401     2.0             1.0000             17
3402     0.0             1.0000             17
3406     0.0             0.6819             17
3407     1.0             1.0000             17
3411     0.0             1.0000             17
3412     1.0             1.0000             17
3413     1.0             0.7023             17
No. of records with gender 0 in cluster 17 is 5
No. of records with gender 1 in cluster 17 is 9
No. of records with gender 2 in cluster 17 is 3

Records found in cluster 18 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
3744     0.0             0.6440             18
3927     0.0             1.0000             18
3994     1.0             1.0000             18
4057     2.0             0.3516             18
4300     2.0             0.6736             18
4398     1.0             1.0000             18
4470     2.0             0.6602             18
4544     0.0             1.0000             18
4640     2.0             1.0000             18
4800     2.0             0.6575             18
4883     2.0             1.0000             18
5043     1.0             1.0000             18
5238     1.0             1.0000             18
5325     1.0             0.6645             18
5515     2.0             1.0000             18
5659     1.0             1.0000             18
5978     2.0             1.0000             18
6188     2.0             0.6748             18
6440     2.0             1.0000             18
6562     0.0             1.0000             18
6671     2.0             1.0000             18
6749     1.0             1.0000             18
6826     2.0             0.6933             18
7050     0.0             0.6736             18
No. of records with gender 0 in cluster 18 is 5
No. of records with gender 1 in cluster 18 is 7
No. of records with gender 2 in cluster 18 is 12

Records found in cluster 19 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
3878      0.0             0.6691             19
4606      0.0             1.0000             19
4627      2.0             1.0000             19
4690      0.0             0.6763             19
4712      0.0             1.0000             19
...       ...                ...            ...
9294      0.0             1.0000             19
9313      2.0             0.6841             19
11175     0.0             1.0000             19
13999     0.0             0.6649             19
18789     0.0             1.0000             19

[275 rows x 3 columns]
No. of records with gender 0 in cluster 19 is 81
No. of records with gender 1 in cluster 19 is 63
No. of records with gender 2 in cluster 19 is 131

Records found in cluster 20 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
4012     1.0             1.0000             20
4097     0.0             0.6706             20
4100     2.0             1.0000             20
4177     0.0             0.6729             20
4219     0.0             1.0000             20
...      ...                ...            ...
5777     2.0             0.6638             20
5809     0.0             1.0000             20
5849     0.0             0.6792             20
5881     2.0             1.0000             20
5910     0.0             0.6787             20

[112 rows x 3 columns]
No. of records with gender 0 in cluster 20 is 40
No. of records with gender 1 in cluster 20 is 27
No. of records with gender 2 in cluster 20 is 45

Records found in cluster 21 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
4146     0.0             1.0000             21
5546     1.0             1.0000             21
5644     1.0             0.6725             21
6374     2.0             1.0000             21
6391     1.0             1.0000             21
6688     2.0             1.0000             21
6772     2.0             1.0000             21
6814     1.0             1.0000             21
6882     0.0             0.6879             21
6904     2.0             0.6842             21
7745     1.0             1.0000             21
8159     2.0             1.0000             21
8331     2.0             0.6716             21
8340     2.0             0.6707             21
8487     0.0             0.6806             21
8505     1.0             1.0000             21
8622     0.0             0.6634             21
8690     2.0             1.0000             21
8764     2.0             0.6674             21
8784     2.0             1.0000             21
8834     2.0             1.0000             21
8859     2.0             1.0000             21
8971     1.0             1.0000             21
9055     1.0             1.0000             21
No. of records with gender 0 in cluster 21 is 4
No. of records with gender 1 in cluster 21 is 8
No. of records with gender 2 in cluster 21 is 12

Records found in cluster 22 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
4224     0.0             1.0000             22
4319     0.0             1.0000             22
4392     0.0             0.6567             22
4506     0.0             1.0000             22
4558     2.0             0.6866             22
...      ...                ...            ...
9151     1.0             0.6453             22
9194     2.0             1.0000             22
9195     1.0             1.0000             22
9220     2.0             1.0000             22
9283     2.0             0.6659             22

[97 rows x 3 columns]
No. of records with gender 0 in cluster 22 is 23
No. of records with gender 1 in cluster 22 is 28
No. of records with gender 2 in cluster 22 is 46

Records found in cluster 23 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
4510     2.0             1.0000             23
4657     2.0             0.6751             23
4674     2.0             1.0000             23
4826     1.0             0.6887             23
5007     0.0             1.0000             23
5094     1.0             1.0000             23
5192     2.0             0.6835             23
5471     0.0             1.0000             23
5561     0.0             1.0000             23
5572     1.0             1.0000             23
5598     0.0             1.0000             23
5807     1.0             1.0000             23
5877     1.0             1.0000             23
6063     2.0             0.6930             23
6082     2.0             1.0000             23
6476     0.0             1.0000             23
6505     2.0             1.0000             23
6599     2.0             1.0000             23
6884     2.0             1.0000             23
6983     2.0             1.0000             23
7497     0.0             0.6799             23
7508     0.0             1.0000             23
7509     1.0             1.0000             23
7593     2.0             1.0000             23
7596     0.0             1.0000             23
7652     0.0             0.6772             23
7760     2.0             1.0000             23
7966     0.0             0.6607             23
8050     2.0             1.0000             23
8203     2.0             1.0000             23
8269     0.0             0.6774             23
8313     0.0             1.0000             23
8353     2.0             0.6650             23
8412     1.0             0.6900             23
8478     0.0             1.0000             23
8525     0.0             1.0000             23
8528     1.0             1.0000             23
8531     2.0             0.6681             23
8586     0.0             0.6453             23
8645     2.0             0.6778             23
8699     0.0             1.0000             23
8711     2.0             1.0000             23
8739     0.0             1.0000             23
8849     0.0             0.6906             23
8865     1.0             1.0000             23
8886     2.0             0.3536             23
8923     2.0             1.0000             23
8948     2.0             1.0000             23
8997     0.0             1.0000             23
9056     2.0             1.0000             23
9125     2.0             1.0000             23
9190     1.0             1.0000             23
9293     0.0             1.0000             23
No. of records with gender 0 in cluster 23 is 20
No. of records with gender 1 in cluster 23 is 10
No. of records with gender 2 in cluster 23 is 23

Records found in cluster 24 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
4572      1.0             1.0000             24
4746      1.0             1.0000             24
4970      0.0             1.0000             24
4997      2.0             0.6957             24
5002      1.0             1.0000             24
5069      2.0             0.6832             24
5153      2.0             0.6735             24
5156      2.0             0.6516             24
5200      0.0             1.0000             24
5227      1.0             1.0000             24
5265      1.0             1.0000             24
5319      1.0             1.0000             24
5328      0.0             1.0000             24
5348      0.0             1.0000             24
5351      0.0             1.0000             24
5401      2.0             0.6836             24
5470      2.0             1.0000             24
5511      1.0             1.0000             24
5616      2.0             1.0000             24
5625      0.0             1.0000             24
5632      2.0             0.6651             24
5674      1.0             1.0000             24
5712      1.0             1.0000             24
5793      2.0             0.6675             24
5846      2.0             1.0000             24
5883      2.0             0.6725             24
5904      0.0             1.0000             24
5954      0.0             1.0000             24
5973      2.0             0.6509             24
6071      2.0             0.6524             24
6102      0.0             0.6699             24
6228      0.0             0.6636             24
6293      0.0             1.0000             24
6309      1.0             0.3750             24
6327      2.0             0.6733             24
6400      2.0             1.0000             24
6403      2.0             0.6663             24
6577      2.0             1.0000             24
6579      2.0             0.6762             24
6633      0.0             1.0000             24
6670      0.0             1.0000             24
6758      0.0             0.3469             24
7258      1.0             0.6902             24
7532      2.0             1.0000             24
7681      1.0             1.0000             24
7703      1.0             1.0000             24
7882      0.0             1.0000             24
18803     1.0             1.0000             24
No. of records with gender 0 in cluster 24 is 15
No. of records with gender 1 in cluster 24 is 14
No. of records with gender 2 in cluster 24 is 19

Records found in cluster 25 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
4595      1.0             1.0000             25
4621      1.0             1.0000             25
4685      2.0             1.0000             25
4780      2.0             1.0000             25
4866      2.0             1.0000             25
...       ...                ...            ...
12284     1.0             1.0000             25
12397     0.0             1.0000             25
12507     2.0             1.0000             25
12659     2.0             1.0000             25
12754     2.0             0.6615             25

[134 rows x 3 columns]
No. of records with gender 0 in cluster 25 is 30
No. of records with gender 1 in cluster 25 is 38
No. of records with gender 2 in cluster 25 is 66

Records found in cluster 26 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
6903     2.0             1.0000             26
7336     1.0             0.6624             26
7531     2.0             1.0000             26
7620     1.0             0.6549             26
8113     2.0             0.6675             26
8116     2.0             0.6611             26
8178     2.0             1.0000             26
8204     2.0             0.6746             26
8272     2.0             1.0000             26
8338     0.0             1.0000             26
8356     1.0             0.6517             26
8402     2.0             0.6767             26
8520     2.0             0.6820             26
8546     1.0             1.0000             26
8580     2.0             1.0000             26
8679     2.0             1.0000             26
8688     2.0             0.3354             26
8732     2.0             0.6946             26
8783     2.0             1.0000             26
8854     0.0             1.0000             26
8940     2.0             0.6815             26
8954     2.0             1.0000             26
8965     2.0             1.0000             26
9123     2.0             1.0000             26
9130     2.0             0.6741             26
9207     2.0             1.0000             26
9212     0.0             1.0000             26
9217     2.0             0.3376             26
9228     0.0             1.0000             26
9323     1.0             1.0000             26
No. of records with gender 0 in cluster 26 is 4
No. of records with gender 1 in cluster 26 is 5
No. of records with gender 2 in cluster 26 is 21

Records found in cluster 27 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
7289      0.0             1.0000             27
12796     1.0             1.0000             27
13303     1.0             1.0000             27
13417     1.0             1.0000             27
13502     1.0             1.0000             27
13716     1.0             0.6830             27
13901     2.0             0.6611             27
14140     0.0             0.6645             27
14214     2.0             1.0000             27
14269     2.0             0.6868             27
14337     1.0             1.0000             27
14412     1.0             1.0000             27
14483     0.0             1.0000             27
14645     1.0             1.0000             27
15443     2.0             1.0000             27
15534     0.0             1.0000             27
15807     0.0             1.0000             27
15916     1.0             1.0000             27
16188     1.0             1.0000             27
16418     2.0             1.0000             27
16672     1.0             1.0000             27
16725     1.0             1.0000             27
17269     0.0             1.0000             27
17351     1.0             0.6556             27
17442     1.0             1.0000             27
17842     0.0             1.0000             27
18412     2.0             0.6690             27
18510     1.0             1.0000             27
18731     1.0             1.0000             27
18738     2.0             1.0000             27
No. of records with gender 0 in cluster 27 is 7
No. of records with gender 1 in cluster 27 is 16
No. of records with gender 2 in cluster 27 is 7

Records found in cluster 28 from DBSCAN in Exp 1
       gender  gender:confidence  Cluster_Label
7381      2.0             1.0000             28
7470      1.0             0.6810             28
7542      0.0             1.0000             28
7616      2.0             0.6675             28
7675      2.0             1.0000             28
7744      2.0             0.6761             28
7795      1.0             0.6602             28
7871      2.0             1.0000             28
7946      1.0             1.0000             28
8010      1.0             1.0000             28
8069      1.0             1.0000             28
8125      1.0             1.0000             28
8180      1.0             0.6850             28
8253      2.0             1.0000             28
8395      1.0             1.0000             28
8477      1.0             1.0000             28
8532      1.0             1.0000             28
8587      2.0             1.0000             28
8657      1.0             1.0000             28
8755      0.0             0.6707             28
8810      0.0             1.0000             28
8906      1.0             0.7047             28
8977      1.0             1.0000             28
9039      1.0             1.0000             28
9101      0.0             0.3496             28
9172      0.0             1.0000             28
9247      2.0             0.6622             28
9317      0.0             1.0000             28
17122     2.0             0.6583             28
No. of records with gender 0 in cluster 28 is 6
No. of records with gender 1 in cluster 28 is 14
No. of records with gender 2 in cluster 28 is 9

Records found in cluster 29 from DBSCAN in Exp 1
      gender  gender:confidence  Cluster_Label
7434     2.0             1.0000             29
7662     0.0             1.0000             29
7811     2.0             0.6341             29
7910     2.0             1.0000             29
8401     0.0             0.6732             29
8489     0.0             1.0000             29
8535     2.0             1.0000             29
8583     0.0             1.0000             29
8623     2.0             0.6778             29
8647     2.0             1.0000             29
8925     0.0             1.0000             29
8930     2.0             1.0000             29
9001     1.0             1.0000             29
9076     2.0             1.0000             29
9089     1.0             1.0000             29
9118     2.0             0.6712             29
9166     2.0             1.0000             29
9280     1.0             1.0000             29
No. of records with gender 0 in cluster 29 is 5
No. of records with gender 1 in cluster 29 is 3
No. of records with gender 2 in cluster 29 is 10
Records classified as noise
       gender  gender:confidence  Cluster_Label
941       2.0             0.6582             -1
1367      1.0             1.0000             -1
2135      2.0             1.0000             -1
2382      1.0             1.0000             -1
2897      2.0             1.0000             -1
...       ...                ...            ...
18272     0.0             0.6686             -1
18399     0.0             1.0000             -1
18527     1.0             1.0000             -1
18646     0.0             1.0000             -1
18759     0.0             0.6386             -1

[128 rows x 3 columns]

==================================================
EXP 2: USING ONLY NUMERICAL AND CATEGORICAL FEATURES
==================================================

Data with Only Numerical and Categorical Features
<class 'pandas.core.frame.DataFrame'>
Index: 19970 entries, 0 to 18833
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   retweets_per_day        18836 non-null  float64
 1   favorites_per_day       18836 non-null  float64
 2   tweets_per_day          18836 non-null  float64
 3   profile_created_year    18836 non-null  float64
 4   tweet_created_year      18836 non-null  float64
 5   tweet_location_encoded  18836 non-null  float64
 6   user_timezone_encoded   18836 non-null  float64
 7   gender                  18836 non-null  float64
 8   gender:confidence       18836 non-null  float64
dtypes: float64(9)
memory usage: 1.5 MB
None

Removing NaN values...
Dropping gender and gender:confidence...

Dataset for Exp 2
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   retweets_per_day        17702 non-null  float64
 1   favorites_per_day       17702 non-null  float64
 2   tweets_per_day          17702 non-null  float64
 3   profile_created_year    17702 non-null  float64
 4   tweet_created_year      17702 non-null  float64
 5   tweet_location_encoded  17702 non-null  float64
 6   user_timezone_encoded   17702 non-null  float64
dtypes: float64(7)
memory usage: 1.1 MB
None

   retweets_per_day  favorites_per_day  tweets_per_day  profile_created_year  \
0         -0.100504          -0.318862        1.467473              0.497680   
1         -0.100504          -0.313380       -0.582881              0.028171   
2          9.949874           0.438028       -0.593854              0.967189   
3         -0.100504          -0.306100       -0.691861             -1.380358   
4         -0.100504           3.133544       -0.075028              0.967189   

   tweet_created_year  tweet_location_encoded  user_timezone_encoded  
0                 0.0                0.000053               0.001699  
1                 0.0                0.363294               0.127309  
2                 0.0                0.000053               0.002071  
3                 0.0                0.000159               0.105755  
4                 0.0                0.363294               0.381344  
Applying UMAP for dim reduction...
[I 2024-09-19 17:49:07,472] A new study created in memory with name: no-name-ededf1db-d8b9-424d-a4b0-6fb01985e602
(17702, 3)

Performing K-Means Clustering...
[I 2024-09-19 17:49:11,001] Trial 0 finished with value: 0.3505964259796568 and parameters: {'n_clusters': 8, 'init': 'k-means++'}. Best is trial 0 with value: 0.3505964259796568.
[I 2024-09-19 17:49:14,567] Trial 1 finished with value: 0.35147759105495685 and parameters: {'n_clusters': 9, 'init': 'k-means++'}. Best is trial 1 with value: 0.35147759105495685.
[I 2024-09-19 17:49:18,508] Trial 2 finished with value: 0.37002444364194625 and parameters: {'n_clusters': 3, 'init': 'k-means++'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:22,259] Trial 3 finished with value: 0.35071003394577077 and parameters: {'n_clusters': 8, 'init': 'random'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:26,124] Trial 4 finished with value: 0.35071003394577077 and parameters: {'n_clusters': 8, 'init': 'random'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:29,770] Trial 5 finished with value: 0.27190640524695253 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:33,550] Trial 6 finished with value: 0.35279956412054736 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 2 with value: 0.37002444364194625.
[I 2024-09-19 17:49:37,429] Trial 7 finished with value: 0.4024977317416951 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 7 with value: 0.4024977317416951.
[I 2024-09-19 17:49:41,117] Trial 8 finished with value: 0.4278633474913973 and parameters: {'n_clusters': 5, 'init': 'random'}. Best is trial 8 with value: 0.4278633474913973.
[I 2024-09-19 17:49:44,821] Trial 9 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:48,424] Trial 10 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:52,022] Trial 11 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:55,618] Trial 12 finished with value: 0.4280682750014589 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:49:59,260] Trial 13 finished with value: 0.33270855690372014 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
[I 2024-09-19 17:50:02,878] Trial 14 finished with value: 0.4274023684329269 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 9 with value: 0.4280682750014589.
Best params: {'n_clusters': 6, 'init': 'k-means++'}
No description has been provided for this image
No description has been provided for this image
[I 2024-09-19 17:50:07,251] A new study created in memory with name: no-name-195f08bf-a772-410f-b5da-70f8ab50a69d
The Silhouette score is 0.4280682750014589
The Callinski index is 8777.448715016866

Dataset with Labels from KMeans in Exp 2
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              4
3     0.0             1.0000              1
4     1.0             1.0000              2

Records found in cluster 0 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
7         0.0             1.0000              0
8         1.0             1.0000              0
11        2.0             1.0000              0
...       ...                ...            ...
18828     1.0             0.3460              0
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18835     0.0             0.6772              0

[9067 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 2483
No. of records with gender 1 in cluster 0 is 3062
No. of records with gender 2 in cluster 0 is 3522

Records found in cluster 1 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
3         0.0             1.0000              1
5         1.0             1.0000              1
6         2.0             1.0000              1
9         1.0             1.0000              1
10        2.0             0.7002              1
...       ...                ...            ...
18811     2.0             1.0000              1
18813     0.0             1.0000              1
18814     0.0             1.0000              1
18817     2.0             0.6579              1
18821     1.0             1.0000              1

[6958 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 2818
No. of records with gender 1 in cluster 1 is 2591
No. of records with gender 2 in cluster 1 is 1549

Records found in cluster 2 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
4         1.0             1.0000              2
62        1.0             1.0000              2
87        1.0             1.0000              2
101       0.0             1.0000              2
106       1.0             1.0000              2
...       ...                ...            ...
18683     1.0             1.0000              2
18696     1.0             0.6644              2
18788     2.0             0.3429              2
18807     0.0             1.0000              2
18834     1.0             1.0000              2

[712 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 263
No. of records with gender 1 in cluster 2 is 333
No. of records with gender 2 in cluster 2 is 116

Records found in cluster 3 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
13        2.0             1.0000              3
34        2.0             1.0000              3
59        2.0             0.6694              3
65        0.0             0.6539              3
69        1.0             0.6738              3
...       ...                ...            ...
18659     1.0             1.0000              3
18661     2.0             1.0000              3
18680     0.0             1.0000              3
18693     1.0             0.6553              3
18763     2.0             1.0000              3

[557 rows x 3 columns]
No. of records with gender 0 in cluster 3 is 128
No. of records with gender 1 in cluster 3 is 102
No. of records with gender 2 in cluster 3 is 327

Records found in cluster 4 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
2         0.0             0.6625              4
286       2.0             1.0000              4
392       2.0             0.6576              4
429       1.0             1.0000              4
633       2.0             1.0000              4
...       ...                ...            ...
18071     2.0             1.0000              4
18108     0.0             1.0000              4
18502     2.0             1.0000              4
18516     2.0             1.0000              4
18649     0.0             1.0000              4

[162 rows x 3 columns]
No. of records with gender 0 in cluster 4 is 57
No. of records with gender 1 in cluster 4 is 42
No. of records with gender 2 in cluster 4 is 63

Records found in cluster 5 from KMeans in Exp 2
       gender  gender:confidence  Cluster_Label
257       1.0             1.0000              5
306       1.0             1.0000              5
308       1.0             0.6752              5
1540      0.0             1.0000              5
1622      1.0             1.0000              5
...       ...                ...            ...
18407     2.0             1.0000              5
18720     0.0             1.0000              5
18765     1.0             1.0000              5
18784     2.0             1.0000              5
18796     0.0             0.6760              5

[246 rows x 3 columns]
No. of records with gender 0 in cluster 5 is 94
No. of records with gender 1 in cluster 5 is 71
No. of records with gender 2 in cluster 5 is 81

Performing DBSCAN Clustering...
[I 2024-09-19 17:50:17,407] Trial 0 finished with value: 0.7435771107611636 and parameters: {'eps': 1.6875605206990094, 'min_samples': 9}. Best is trial 0 with value: 0.7435771107611636.
[I 2024-09-19 17:50:27,746] Trial 1 finished with value: 0.755250571224431 and parameters: {'eps': 1.9630802278917843, 'min_samples': 11}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:34,083] Trial 2 finished with value: 0.3476584528395313 and parameters: {'eps': 0.3823838274275766, 'min_samples': 7}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:42,247] Trial 3 finished with value: 0.517654092876453 and parameters: {'eps': 0.7134123414087624, 'min_samples': 12}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:51,986] Trial 4 finished with value: 0.7205824902178928 and parameters: {'eps': 1.4568916045764555, 'min_samples': 19}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:50:56,780] Trial 5 finished with value: -0.18883320225862038 and parameters: {'eps': 0.19229954783245615, 'min_samples': 19}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:06,671] Trial 6 finished with value: 0.7543665225775109 and parameters: {'eps': 1.761574322432801, 'min_samples': 7}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:12,117] Trial 7 finished with value: -0.04795422555050775 and parameters: {'eps': 0.2792160603274616, 'min_samples': 17}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:21,561] Trial 8 finished with value: 0.5299892203558405 and parameters: {'eps': 1.3258984479955693, 'min_samples': 5}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:25,994] Trial 9 finished with value: -0.4879587554963607 and parameters: {'eps': 0.11341970084515758, 'min_samples': 12}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:36,042] Trial 10 finished with value: 0.7543453657820123 and parameters: {'eps': 1.9639453920910408, 'min_samples': 14}. Best is trial 1 with value: 0.755250571224431.
[I 2024-09-19 17:51:46,203] Trial 11 finished with value: 0.7632826665730776 and parameters: {'eps': 1.9652082616592303, 'min_samples': 4}. Best is trial 11 with value: 0.7632826665730776.
[I 2024-09-19 17:51:56,339] Trial 12 finished with value: 0.7695151102549678 and parameters: {'eps': 1.9976508214248325, 'min_samples': 3}. Best is trial 12 with value: 0.7695151102549678.
[I 2024-09-19 17:52:05,529] Trial 13 finished with value: 0.5035289768021715 and parameters: {'eps': 1.0518221952011264, 'min_samples': 3}. Best is trial 12 with value: 0.7695151102549678.
[I 2024-09-19 17:52:15,252] Trial 14 finished with value: 0.7060846213740725 and parameters: {'eps': 1.4972565559855149, 'min_samples': 3}. Best is trial 12 with value: 0.7695151102549678.
Found best params: {'eps': 1.9976508214248325, 'min_samples': 3}
No description has been provided for this image
The Silhouette score is 0.7695151102549678
The Callinski index is 182.57715113799554

Dataset with Labels from DBSCAN in Exp 2
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from DBSCAN in Exp 2
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[17690 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5836
No. of records with gender 1 in cluster 0 is 6198
No. of records with gender 2 in cluster 0 is 5656
Records classified as noise
       gender  gender:confidence  Cluster_Label
2502      0.0             0.6785             -1
3301      0.0             1.0000             -1
5613      1.0             1.0000             -1
6722      1.0             1.0000             -1
7666      2.0             1.0000             -1
10926     0.0             0.6513             -1
12504     0.0             1.0000             -1
12668     0.0             1.0000             -1
13331     1.0             1.0000             -1
15940     0.0             1.0000             -1
17960     0.0             1.0000             -1
18763     2.0             1.0000             -1

==================================================
EXP 3: USING ONLY TEXT FEATURES
==================================================
Dataset for Exp 3
<class 'pandas.core.frame.DataFrame'>
Index: 17702 entries, 0 to 18835
Columns: 3000 entries, desc_0 to text_1499
dtypes: float64(3000)
memory usage: 405.3 MB
None

   desc_0  desc_1  desc_2    desc_3  desc_4  desc_5  desc_6  desc_7  desc_8  \
0     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0 -0.142028     0.0     0.0     0.0     0.0     0.0   

   desc_9  ...  text_1490  text_1491  text_1492  text_1493  text_1494  \
0     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
1     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
2     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
3     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   
4     0.0  ...  -0.142855        0.0        0.0        0.0        0.0   

   text_1495  text_1496  text_1497  text_1498  text_1499  
0  -0.142733  -0.100504        0.0        0.0        0.0  
1  -0.142733  -0.100504        0.0        0.0        0.0  
2  -0.142733  -0.100504        0.0        0.0        0.0  
3  -0.142733  -0.100504        0.0        0.0        0.0  
4  -0.142733  -0.100504        0.0        0.0        0.0  

[5 rows x 3000 columns]
Applying UMAP for dim reduction...
[I 2024-09-19 17:55:22,898] A new study created in memory with name: no-name-909a06f1-95c6-488e-b35e-549634c3f8ed
Performing K-Means Clustering...
[I 2024-09-19 17:55:28,056] Trial 0 finished with value: 0.3474574387073517 and parameters: {'n_clusters': 7, 'init': 'k-means++'}. Best is trial 0 with value: 0.3474574387073517.
[I 2024-09-19 17:55:33,433] Trial 1 finished with value: 0.7000453472137451 and parameters: {'n_clusters': 4, 'init': 'random'}. Best is trial 1 with value: 0.7000453472137451.
[I 2024-09-19 17:55:38,578] Trial 2 finished with value: 0.41475844383239746 and parameters: {'n_clusters': 5, 'init': 'k-means++'}. Best is trial 1 with value: 0.7000453472137451.
[I 2024-09-19 17:55:43,540] Trial 3 finished with value: 0.34986206889152527 and parameters: {'n_clusters': 9, 'init': 'random'}. Best is trial 1 with value: 0.7000453472137451.
[I 2024-09-19 17:55:48,953] Trial 4 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:55:53,991] Trial 5 finished with value: 0.32672926783561707 and parameters: {'n_clusters': 10, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:55:59,151] Trial 6 finished with value: 0.3999803364276886 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:04,084] Trial 7 finished with value: 0.35995355248451233 and parameters: {'n_clusters': 6, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:09,149] Trial 8 finished with value: 0.4177650809288025 and parameters: {'n_clusters': 6, 'init': 'random'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:14,677] Trial 9 finished with value: 0.7177287340164185 and parameters: {'n_clusters': 3, 'init': 'random'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:20,202] Trial 10 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:25,524] Trial 11 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:30,964] Trial 12 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:36,305] Trial 13 finished with value: 0.7358670234680176 and parameters: {'n_clusters': 2, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
[I 2024-09-19 17:56:41,355] Trial 14 finished with value: 0.3999803364276886 and parameters: {'n_clusters': 4, 'init': 'k-means++'}. Best is trial 4 with value: 0.7358670234680176.
Best params: {'n_clusters': 2, 'init': 'k-means++'}
No description has been provided for this image
No description has been provided for this image
The Silhouette score is 0.7358670234680176
The Callinski index is 7837.85693359375

Dataset with Labels from KMeans in Exp 3
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from KMeans in Exp 3
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[16759 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5545
No. of records with gender 1 in cluster 0 is 5925
No. of records with gender 2 in cluster 0 is 5289

Records found in cluster 1 from KMeans in Exp 3
[I 2024-09-19 17:56:48,430] A new study created in memory with name: no-name-4c26d09f-cf9c-40a1-b2bd-9e5ad70b1499
       gender  gender:confidence  Cluster_Label
230       1.0             0.6755              1
264       0.0             1.0000              1
282       1.0             1.0000              1
431       0.0             0.6631              1
502       0.0             1.0000              1
...       ...                ...            ...
18609     1.0             1.0000              1
18646     0.0             1.0000              1
18759     0.0             0.6386              1
18789     0.0             1.0000              1
18803     1.0             1.0000              1

[943 rows x 3 columns]
No. of records with gender 0 in cluster 1 is 298
No. of records with gender 1 in cluster 1 is 276
No. of records with gender 2 in cluster 1 is 369

Performing DBSCAN Clustering...
[I 2024-09-19 17:56:54,626] Trial 0 finished with value: 0.39434999227523804 and parameters: {'eps': 0.9001288568044092, 'min_samples': 9}. Best is trial 0 with value: 0.39434999227523804.
[I 2024-09-19 17:57:01,530] Trial 1 finished with value: 0.6305991411209106 and parameters: {'eps': 1.6892734071090372, 'min_samples': 17}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:07,375] Trial 2 finished with value: 0.4701833426952362 and parameters: {'eps': 0.7833320007467396, 'min_samples': 19}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:12,834] Trial 3 finished with value: 0.08904338628053665 and parameters: {'eps': 0.20500303205257642, 'min_samples': 14}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:18,723] Trial 4 finished with value: 0.500934362411499 and parameters: {'eps': 0.7954866903955463, 'min_samples': 13}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:25,434] Trial 5 finished with value: 0.6164038777351379 and parameters: {'eps': 1.560253475900042, 'min_samples': 16}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:31,698] Trial 6 finished with value: 0.5461991429328918 and parameters: {'eps': 1.1036293500557977, 'min_samples': 4}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:37,306] Trial 7 finished with value: 0.42453014850616455 and parameters: {'eps': 0.41359736485670484, 'min_samples': 15}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:43,423] Trial 8 finished with value: 0.5508587956428528 and parameters: {'eps': 0.9630808694485326, 'min_samples': 20}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:49,415] Trial 9 finished with value: 0.476026713848114 and parameters: {'eps': 0.8700905877932096, 'min_samples': 17}. Best is trial 1 with value: 0.6305991411209106.
[I 2024-09-19 17:57:56,401] Trial 10 finished with value: 0.6314875483512878 and parameters: {'eps': 1.8598038150118554, 'min_samples': 10}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:03,379] Trial 11 finished with value: 0.631306529045105 and parameters: {'eps': 1.9086572095048915, 'min_samples': 9}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:10,505] Trial 12 finished with value: 0.6309650540351868 and parameters: {'eps': 1.9971290436396727, 'min_samples': 9}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:17,431] Trial 13 finished with value: 0.6314875483512878 and parameters: {'eps': 1.844756920307563, 'min_samples': 9}. Best is trial 10 with value: 0.6314875483512878.
[I 2024-09-19 17:58:24,052] Trial 14 finished with value: 0.5369197726249695 and parameters: {'eps': 1.3935522371671678, 'min_samples': 5}. Best is trial 10 with value: 0.6314875483512878.
Found best params: {'eps': 1.8598038150118554, 'min_samples': 10}
No description has been provided for this image
The Silhouette score is 0.6314875483512878
The Callinski index is 1509.1162109375

Dataset with Labels from DBSCAN in Exp 3
   gender  gender:confidence  Cluster_Label
0     0.0             1.0000              0
1     0.0             1.0000              0
2     0.0             0.6625              0
3     0.0             1.0000              0
4     1.0             1.0000              0

Records found in cluster 0 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
0         0.0             1.0000              0
1         0.0             1.0000              0
2         0.0             0.6625              0
3         0.0             1.0000              0
4         1.0             1.0000              0
...       ...                ...            ...
18829     1.0             1.0000              0
18831     0.0             0.6466              0
18832     1.0             1.0000              0
18834     1.0             1.0000              0
18835     0.0             0.6772              0

[15997 rows x 3 columns]
No. of records with gender 0 in cluster 0 is 5341
No. of records with gender 1 in cluster 0 is 5683
No. of records with gender 2 in cluster 0 is 4973

Records found in cluster 1 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
42       2.0             1.0000              1
190      2.0             0.6780              1
252      2.0             1.0000              1
255      1.0             1.0000              1
328      1.0             1.0000              1
382      2.0             0.6753              1
836      1.0             1.0000              1
838      2.0             0.6857              1
980      2.0             1.0000              1
1011     1.0             1.0000              1
1102     1.0             0.6777              1
1276     2.0             1.0000              1
1439     2.0             1.0000              1
1690     2.0             1.0000              1
1702     0.0             1.0000              1
1814     1.0             0.3467              1
1923     0.0             1.0000              1
1938     2.0             1.0000              1
1943     1.0             1.0000              1
2062     0.0             1.0000              1
2141     1.0             1.0000              1
2159     1.0             1.0000              1
2392     2.0             1.0000              1
2420     0.0             1.0000              1
2591     0.0             0.6706              1
2660     0.0             0.3478              1
2856     0.0             1.0000              1
2893     2.0             1.0000              1
2973     0.0             0.6839              1
3034     0.0             0.6673              1
3147     1.0             1.0000              1
3184     0.0             0.6763              1
3326     0.0             1.0000              1
3384     0.0             0.6872              1
3487     1.0             1.0000              1
3799     1.0             1.0000              1
4237     1.0             1.0000              1
4502     0.0             1.0000              1
4550     2.0             1.0000              1
4704     0.0             0.6655              1
4850     0.0             1.0000              1
4913     1.0             1.0000              1
5023     0.0             1.0000              1
5139     2.0             0.6684              1
5727     2.0             1.0000              1
6326     2.0             0.6690              1
6339     0.0             1.0000              1
6525     2.0             0.6797              1
6813     2.0             1.0000              1
6874     0.0             1.0000              1
7582     2.0             0.6667              1
7867     0.0             1.0000              1
7995     2.0             1.0000              1
8096     2.0             1.0000              1
8262     2.0             1.0000              1
8347     2.0             1.0000              1
8459     0.0             0.6652              1
9204     1.0             1.0000              1
9211     0.0             1.0000              1
9264     1.0             1.0000              1
No. of records with gender 0 in cluster 1 is 21
No. of records with gender 1 in cluster 1 is 16
No. of records with gender 2 in cluster 1 is 23

Records found in cluster 2 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
211       2.0             1.0000              2
1594      2.0             0.6983              2
3203      0.0             1.0000              2
4433      1.0             1.0000              2
9376      0.0             1.0000              2
...       ...                ...            ...
18546     1.0             1.0000              2
18573     0.0             1.0000              2
18584     1.0             1.0000              2
18624     1.0             1.0000              2
18656     1.0             1.0000              2

[88 rows x 3 columns]
No. of records with gender 0 in cluster 2 is 23
No. of records with gender 1 in cluster 2 is 36
No. of records with gender 2 in cluster 2 is 29

Records found in cluster 3 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
230       1.0             0.6755              3
264       0.0             1.0000              3
1582      1.0             1.0000              3
3133      0.0             1.0000              3
3292      0.0             0.6814              3
3301      0.0             1.0000              3
3484      0.0             1.0000              3
4219      0.0             1.0000              3
4224      0.0             1.0000              3
4226      2.0             1.0000              3
4253      2.0             1.0000              3
4269      0.0             1.0000              3
4283      1.0             1.0000              3
4298      1.0             0.6539              3
4319      0.0             1.0000              3
4344      2.0             1.0000              3
4356      2.0             0.6709              3
4367      0.0             1.0000              3
4370      0.0             1.0000              3
4381      1.0             0.6719              3
4392      0.0             0.6567              3
4396      0.0             1.0000              3
4426      2.0             0.6838              3
4432      0.0             1.0000              3
4440      0.0             1.0000              3
4444      0.0             0.6422              3
4457      1.0             1.0000              3
4489      1.0             1.0000              3
4506      0.0             1.0000              3
4510      2.0             1.0000              3
4536      2.0             1.0000              3
4558      2.0             0.6866              3
4560      0.0             1.0000              3
4572      1.0             1.0000              3
4584      0.0             1.0000              3
4590      2.0             1.0000              3
4595      1.0             1.0000              3
10146     0.0             0.6757              3
10314     1.0             1.0000              3
10582     2.0             0.6383              3
10622     1.0             0.6692              3
11062     0.0             1.0000              3
11175     0.0             1.0000              3
11817     2.0             1.0000              3
12671     1.0             1.0000              3
12711     0.0             0.6667              3
12771     1.0             0.6677              3
13288     1.0             1.0000              3
13556     2.0             0.6581              3
13780     2.0             1.0000              3
14213     2.0             0.6692              3
14346     1.0             0.6710              3
14750     1.0             1.0000              3
15743     0.0             1.0000              3
15816     1.0             1.0000              3
17504     0.0             0.6567              3
18083     0.0             1.0000              3
18609     1.0             1.0000              3
No. of records with gender 0 in cluster 3 is 26
No. of records with gender 1 in cluster 3 is 18
No. of records with gender 2 in cluster 3 is 14

Records found in cluster 4 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
282      1.0             1.0000              4
502      0.0             1.0000              4
578      1.0             1.0000              4
644      0.0             1.0000              4
771      0.0             1.0000              4
963      2.0             1.0000              4
1433     1.0             1.0000              4
1881     0.0             0.6691              4
2762     2.0             0.6670              4
2903     1.0             0.6763              4
2929     0.0             1.0000              4
3229     0.0             1.0000              4
3308     0.0             0.3364              4
3353     0.0             1.0000              4
3681     2.0             1.0000              4
3770     0.0             1.0000              4
3830     0.0             1.0000              4
4305     1.0             1.0000              4
5040     0.0             1.0000              4
5367     0.0             1.0000              4
5479     0.0             0.6857              4
5634     2.0             0.6840              4
5742     0.0             1.0000              4
6460     2.0             1.0000              4
6862     1.0             1.0000              4
8397     2.0             0.6634              4
8516     2.0             0.6839              4
8918     2.0             1.0000              4
No. of records with gender 0 in cluster 4 is 14
No. of records with gender 1 in cluster 4 is 6
No. of records with gender 2 in cluster 4 is 8

Records found in cluster 5 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
431       0.0             0.6631              5
3276      1.0             1.0000              5
4374      2.0             1.0000              5
4456      1.0             1.0000              5
4653      2.0             1.0000              5
4995      2.0             1.0000              5
5220      2.0             0.6650              5
5372      2.0             1.0000              5
5749      2.0             1.0000              5
6043      2.0             0.6787              5
6172      2.0             1.0000              5
6208      1.0             0.6543              5
6496      2.0             0.6716              5
6669      0.0             1.0000              5
7060      1.0             0.6890              5
7261      0.0             1.0000              5
7439      0.0             1.0000              5
7683      1.0             0.6699              5
7902      0.0             1.0000              5
8120      1.0             1.0000              5
8360      2.0             0.6854              5
8408      0.0             1.0000              5
9100      0.0             1.0000              5
9333      1.0             1.0000              5
10448     2.0             0.6544              5
10820     0.0             0.6635              5
11056     1.0             1.0000              5
12961     1.0             1.0000              5
13252     1.0             1.0000              5
14102     0.0             1.0000              5
14844     0.0             1.0000              5
15017     1.0             1.0000              5
No. of records with gender 0 in cluster 5 is 10
No. of records with gender 1 in cluster 5 is 11
No. of records with gender 2 in cluster 5 is 11

Records found in cluster 6 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
444       0.0             1.0000              6
1112      2.0             1.0000              6
5901      2.0             1.0000              6
5902      0.0             0.6462              6
5904      0.0             1.0000              6
5910      0.0             0.6787              6
5914      2.0             1.0000              6
5930      0.0             0.6512              6
5932      0.0             1.0000              6
5934      2.0             1.0000              6
5935      2.0             1.0000              6
5936      2.0             0.6836              6
5945      2.0             1.0000              6
5952      0.0             1.0000              6
5954      0.0             1.0000              6
5956      2.0             1.0000              6
5961      2.0             1.0000              6
5962      1.0             1.0000              6
5963      0.0             1.0000              6
5964      1.0             1.0000              6
5965      2.0             0.6764              6
5966      2.0             0.6842              6
5973      2.0             0.6509              6
5986      0.0             1.0000              6
5989      2.0             1.0000              6
5990      0.0             0.6713              6
10357     1.0             1.0000              6
11202     0.0             1.0000              6
13236     0.0             1.0000              6
13487     0.0             1.0000              6
14898     1.0             1.0000              6
15100     0.0             0.6715              6
15296     0.0             1.0000              6
16532     1.0             1.0000              6
16536     0.0             0.6770              6
17155     1.0             1.0000              6
No. of records with gender 0 in cluster 6 is 17
No. of records with gender 1 in cluster 6 is 6
No. of records with gender 2 in cluster 6 is 13

Records found in cluster 7 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
575      0.0             1.0000              7
1308     0.0             0.6479              7
2033     1.0             1.0000              7
2308     1.0             0.6774              7
3898     0.0             1.0000              7
5454     2.0             0.6774              7
5539     1.0             1.0000              7
5628     2.0             1.0000              7
5825     1.0             1.0000              7
5847     2.0             0.6717              7
6012     0.0             1.0000              7
6048     2.0             0.6796              7
6114     1.0             0.6620              7
6335     2.0             1.0000              7
6382     2.0             0.6842              7
6417     2.0             1.0000              7
7843     2.0             1.0000              7
8181     0.0             1.0000              7
8355     2.0             0.6778              7
8738     0.0             1.0000              7
No. of records with gender 0 in cluster 7 is 6
No. of records with gender 1 in cluster 7 is 5
No. of records with gender 2 in cluster 7 is 9

Records found in cluster 8 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
805       2.0             1.0000              8
1520      0.0             1.0000              8
2701      0.0             1.0000              8
4906      2.0             0.6681              8
4908      0.0             1.0000              8
4909      2.0             1.0000              8
4910      0.0             1.0000              8
4912      1.0             1.0000              8
4917      1.0             0.6571              8
4918      0.0             1.0000              8
4923      2.0             1.0000              8
4924      2.0             0.6585              8
4929      1.0             1.0000              8
4934      1.0             0.6571              8
4937      2.0             1.0000              8
4944      1.0             0.6711              8
4949      2.0             1.0000              8
4950      1.0             1.0000              8
4951      1.0             1.0000              8
4961      0.0             1.0000              8
4962      1.0             1.0000              8
4965      2.0             0.6695              8
4967      0.0             1.0000              8
4968      1.0             1.0000              8
4970      0.0             1.0000              8
4973      1.0             1.0000              8
4990      1.0             1.0000              8
4997      2.0             0.6957              8
4999      2.0             0.6884              8
8976      1.0             1.0000              8
15005     0.0             1.0000              8
15181     0.0             0.6875              8
18661     2.0             1.0000              8
No. of records with gender 0 in cluster 8 is 10
No. of records with gender 1 in cluster 8 is 12
No. of records with gender 2 in cluster 8 is 11

Records found in cluster 9 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1203      1.0             1.0000              9
1240      1.0             0.6889              9
2115      0.0             1.0000              9
2381      0.0             1.0000              9
3988      2.0             1.0000              9
5994      2.0             0.6611              9
7988      1.0             0.6734              9
8071      1.0             1.0000              9
10735     0.0             1.0000              9
10738     0.0             1.0000              9
11076     2.0             1.0000              9
11179     2.0             1.0000              9
11484     1.0             1.0000              9
11648     1.0             1.0000              9
11746     0.0             1.0000              9
12054     1.0             1.0000              9
13078     0.0             1.0000              9
14056     2.0             1.0000              9
15064     0.0             0.6534              9
15751     1.0             1.0000              9
15757     1.0             1.0000              9
16465     0.0             1.0000              9
16868     1.0             1.0000              9
17448     0.0             1.0000              9
18208     0.0             1.0000              9
18753     0.0             0.6678              9
No. of records with gender 0 in cluster 9 is 11
No. of records with gender 1 in cluster 9 is 10
No. of records with gender 2 in cluster 9 is 5

Records found in cluster 10 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
1273     0.0             1.0000             10
1605     2.0             1.0000             10
1761     2.0             1.0000             10
1845     1.0             1.0000             10
1987     1.0             1.0000             10
2274     0.0             1.0000             10
3961     0.0             1.0000             10
4092     0.0             0.3411             10
4424     2.0             1.0000             10
5218     2.0             1.0000             10
5336     1.0             1.0000             10
5445     0.0             1.0000             10
6262     2.0             1.0000             10
6289     1.0             1.0000             10
7003     1.0             1.0000             10
7118     2.0             1.0000             10
7431     1.0             1.0000             10
7540     0.0             0.6859             10
7791     1.0             1.0000             10
8142     2.0             1.0000             10
8601     2.0             0.6700             10
8693     0.0             1.0000             10
9023     1.0             0.6654             10
9265     1.0             1.0000             10
No. of records with gender 0 in cluster 10 is 7
No. of records with gender 1 in cluster 10 is 9
No. of records with gender 2 in cluster 10 is 8

Records found in cluster 11 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1301      2.0             1.0000             11
1666      0.0             1.0000             11
2010      0.0             1.0000             11
2876      2.0             0.6741             11
3238      1.0             1.0000             11
4359      1.0             1.0000             11
5200      0.0             1.0000             11
5203      1.0             1.0000             11
5205      1.0             0.6748             11
5209      1.0             1.0000             11
5211      0.0             0.6738             11
5217      0.0             1.0000             11
5227      1.0             1.0000             11
5232      1.0             1.0000             11
5234      1.0             1.0000             11
5242      1.0             1.0000             11
5256      2.0             0.6475             11
5262      0.0             0.6457             11
5264      0.0             1.0000             11
5265      1.0             1.0000             11
5266      0.0             1.0000             11
5270      2.0             1.0000             11
5271      2.0             0.6812             11
5272      2.0             1.0000             11
5284      1.0             0.6815             11
5289      0.0             1.0000             11
5291      2.0             0.6333             11
5297      0.0             1.0000             11
7900      2.0             1.0000             11
7908      2.0             1.0000             11
7910      2.0             1.0000             11
7914      2.0             1.0000             11
7933      1.0             1.0000             11
7953      0.0             1.0000             11
7956      1.0             1.0000             11
7958      1.0             1.0000             11
7959      0.0             0.6823             11
7963      2.0             1.0000             11
7964      2.0             1.0000             11
7966      0.0             0.6607             11
7967      2.0             0.6737             11
7968      2.0             1.0000             11
7973      0.0             1.0000             11
7975      0.0             1.0000             11
7976      0.0             1.0000             11
7977      2.0             0.6739             11
7980      2.0             1.0000             11
7987      0.0             1.0000             11
7991      1.0             1.0000             11
7999      2.0             0.6726             11
10908     0.0             1.0000             11
11615     2.0             1.0000             11
12253     1.0             1.0000             11
12766     2.0             0.3547             11
13202     2.0             1.0000             11
15562     0.0             1.0000             11
16542     1.0             1.0000             11
No. of records with gender 0 in cluster 11 is 19
No. of records with gender 1 in cluster 11 is 17
No. of records with gender 2 in cluster 11 is 21

Records found in cluster 12 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1303      1.0             1.0000             12
1365      1.0             1.0000             12
5694      2.0             1.0000             12
8923      2.0             1.0000             12
8925      0.0             1.0000             12
8927      1.0             1.0000             12
8930      2.0             1.0000             12
8940      2.0             0.6815             12
8943      2.0             1.0000             12
8944      2.0             0.6641             12
8945      0.0             1.0000             12
8947      2.0             1.0000             12
8948      2.0             1.0000             12
8951      0.0             0.6752             12
8952      1.0             0.6734             12
8953      1.0             1.0000             12
8954      2.0             1.0000             12
8965      2.0             1.0000             12
8971      1.0             1.0000             12
8981      1.0             1.0000             12
8987      2.0             1.0000             12
8988      0.0             1.0000             12
8989      1.0             1.0000             12
8990      2.0             1.0000             12
8991      2.0             0.6728             12
8995      2.0             0.6761             12
8997      0.0             1.0000             12
9395      1.0             1.0000             12
9792      2.0             0.6642             12
11130     1.0             1.0000             12
11659     0.0             1.0000             12
13220     2.0             1.0000             12
14625     0.0             1.0000             12
15940     0.0             1.0000             12
17978     2.0             1.0000             12
No. of records with gender 0 in cluster 12 is 8
No. of records with gender 1 in cluster 12 is 10
No. of records with gender 2 in cluster 12 is 17

Records found in cluster 13 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1458      2.0             1.0000             13
7860      2.0             0.6321             13
8313      0.0             1.0000             13
8322      1.0             1.0000             13
8327      0.0             0.6763             13
8331      2.0             0.6716             13
8333      2.0             1.0000             13
8337      1.0             1.0000             13
8338      0.0             1.0000             13
8339      0.0             1.0000             13
8340      2.0             0.6707             13
8341      1.0             0.6699             13
8353      2.0             0.6650             13
8356      1.0             0.6517             13
8358      2.0             0.6965             13
8384      0.0             1.0000             13
8385      1.0             1.0000             13
8391      0.0             1.0000             13
12693     0.0             1.0000             13
12899     2.0             1.0000             13
15029     1.0             1.0000             13
No. of records with gender 0 in cluster 13 is 7
No. of records with gender 1 in cluster 13 is 6
No. of records with gender 2 in cluster 13 is 8

Records found in cluster 14 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1474      1.0             0.3390             14
2740      0.0             1.0000             14
4804      2.0             0.6691             14
4811      0.0             1.0000             14
4817      0.0             0.3384             14
...       ...                ...            ...
13842     1.0             1.0000             14
14718     0.0             1.0000             14
16342     1.0             1.0000             14
16883     1.0             1.0000             14
17182     0.0             1.0000             14

[123 rows x 3 columns]
No. of records with gender 0 in cluster 14 is 36
No. of records with gender 1 in cluster 14 is 29
No. of records with gender 2 in cluster 14 is 58

Records found in cluster 15 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
1580     1.0             1.0000             15
9206     2.0             0.3398             15
9207     2.0             1.0000             15
9212     0.0             1.0000             15
9215     1.0             0.6818             15
9216     2.0             0.6519             15
9217     2.0             0.3376             15
9220     2.0             1.0000             15
9221     2.0             1.0000             15
9225     2.0             1.0000             15
9228     0.0             1.0000             15
9243     0.0             0.3506             15
9249     1.0             0.3542             15
9253     2.0             1.0000             15
9278     1.0             1.0000             15
9280     1.0             1.0000             15
9283     2.0             0.6659             15
9289     2.0             1.0000             15
9293     0.0             1.0000             15
9294     0.0             1.0000             15
9904     0.0             1.0000             15
No. of records with gender 0 in cluster 15 is 6
No. of records with gender 1 in cluster 15 is 5
No. of records with gender 2 in cluster 15 is 10

Records found in cluster 16 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
1897      1.0             0.6483             16
8401      0.0             0.6732             16
8402      2.0             0.6767             16
8403      2.0             0.6575             16
8407      0.0             0.6763             16
8411      1.0             1.0000             16
8412      1.0             0.6900             16
8429      1.0             1.0000             16
8460      2.0             0.6828             16
8466      0.0             1.0000             16
8470      1.0             1.0000             16
8478      0.0             1.0000             16
8479      2.0             0.3625             16
8487      0.0             0.6806             16
8489      0.0             1.0000             16
8496      0.0             1.0000             16
12914     1.0             1.0000             16
No. of records with gender 0 in cluster 16 is 7
No. of records with gender 1 in cluster 16 is 6
No. of records with gender 2 in cluster 16 is 4

Records found in cluster 17 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
1940     2.0             0.6675             17
7703     1.0             1.0000             17
7705     1.0             1.0000             17
7727     2.0             1.0000             17
7738     2.0             1.0000             17
7743     0.0             1.0000             17
7745     1.0             1.0000             17
7746     2.0             1.0000             17
7747     2.0             0.6745             17
7748     2.0             1.0000             17
7751     2.0             1.0000             17
7752     1.0             0.6649             17
7757     2.0             1.0000             17
7759     2.0             1.0000             17
7760     2.0             1.0000             17
7761     1.0             1.0000             17
7793     0.0             0.6691             17
7797     2.0             0.6600             17
No. of records with gender 0 in cluster 17 is 2
No. of records with gender 1 in cluster 17 is 5
No. of records with gender 2 in cluster 17 is 11

Records found in cluster 18 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
2135      2.0             1.0000             18
3581      0.0             1.0000             18
3705      2.0             0.6581             18
3809      2.0             1.0000             18
3906      1.0             0.6422             18
...       ...                ...            ...
18531     2.0             1.0000             18
18646     0.0             1.0000             18
18759     0.0             0.6386             18
18789     0.0             1.0000             18
18803     1.0             1.0000             18

[109 rows x 3 columns]
No. of records with gender 0 in cluster 18 is 38
No. of records with gender 1 in cluster 18 is 32
No. of records with gender 2 in cluster 18 is 39

Records found in cluster 19 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
2138      1.0             1.0000             19
2145      0.0             1.0000             19
2146      1.0             1.0000             19
2147      1.0             1.0000             19
2148      1.0             0.3576             19
2156      0.0             1.0000             19
2166      1.0             1.0000             19
2168      0.0             0.6825             19
2169      1.0             1.0000             19
2171      1.0             1.0000             19
2172      0.0             1.0000             19
2182      2.0             1.0000             19
2185      0.0             1.0000             19
2186      0.0             0.3403             19
2187      1.0             1.0000             19
2188      2.0             0.6812             19
2189      0.0             0.6582             19
2191      0.0             1.0000             19
2194      1.0             1.0000             19
2196      1.0             1.0000             19
2204      1.0             0.6587             19
2205      0.0             0.6685             19
2206      1.0             0.6551             19
2207      1.0             1.0000             19
2210      1.0             1.0000             19
2216      1.0             0.6896             19
2217      1.0             0.6832             19
2220      1.0             1.0000             19
2223      2.0             1.0000             19
5916      2.0             0.6935             19
9793      0.0             0.6664             19
11047     1.0             1.0000             19
No. of records with gender 0 in cluster 19 is 10
No. of records with gender 1 in cluster 19 is 18
No. of records with gender 2 in cluster 19 is 4

Records found in cluster 20 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
3252      0.0             1.0000             20
8701      1.0             1.0000             20
8711      2.0             1.0000             20
8728      0.0             1.0000             20
8732      2.0             0.6946             20
8739      0.0             1.0000             20
8744      2.0             1.0000             20
8746      2.0             0.6916             20
8764      2.0             0.6674             20
8765      1.0             0.6611             20
8767      0.0             1.0000             20
8769      2.0             1.0000             20
8772      0.0             0.6732             20
8777      0.0             1.0000             20
8779      2.0             1.0000             20
8782      1.0             1.0000             20
8783      2.0             1.0000             20
8784      2.0             1.0000             20
11222     1.0             1.0000             20
16945     0.0             1.0000             20
No. of records with gender 0 in cluster 20 is 7
No. of records with gender 1 in cluster 20 is 4
No. of records with gender 2 in cluster 20 is 9

Records found in cluster 21 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
3316      0.0             1.0000             21
7600      0.0             1.0000             21
7601      2.0             0.6609             21
7611      0.0             0.6666             21
7613      2.0             1.0000             21
7614      2.0             0.6866             21
7615      2.0             1.0000             21
7620      1.0             0.6549             21
7621      1.0             1.0000             21
7622      2.0             1.0000             21
7626      0.0             1.0000             21
7627      0.0             0.7037             21
7629      2.0             1.0000             21
7652      0.0             0.6772             21
7655      1.0             1.0000             21
7662      0.0             1.0000             21
7665      2.0             0.6832             21
7667      0.0             1.0000             21
7669      2.0             1.0000             21
7670      1.0             1.0000             21
7672      2.0             1.0000             21
7679      1.0             1.0000             21
7680      1.0             1.0000             21
7681      1.0             1.0000             21
7686      2.0             1.0000             21
7694      2.0             1.0000             21
7697      1.0             1.0000             21
12196     0.0             0.7049             21
13766     1.0             1.0000             21
14354     0.0             1.0000             21
No. of records with gender 0 in cluster 21 is 10
No. of records with gender 1 in cluster 21 is 9
No. of records with gender 2 in cluster 21 is 11

Records found in cluster 22 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
3744      0.0             0.6440             22
3927      0.0             1.0000             22
3994      1.0             1.0000             22
4057      2.0             0.3516             22
4300      2.0             0.6736             22
...       ...                ...            ...
12397     0.0             1.0000             22
12507     2.0             1.0000             22
12659     2.0             1.0000             22
12754     2.0             0.6615             22
14756     1.0             1.0000             22

[75 rows x 3 columns]
No. of records with gender 0 in cluster 22 is 18
No. of records with gender 1 in cluster 22 is 21
No. of records with gender 2 in cluster 22 is 36

Records found in cluster 23 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
4547     2.0             1.0000             23
7804     1.0             1.0000             23
7810     2.0             1.0000             23
7811     2.0             0.6341             23
7817     0.0             1.0000             23
7819     0.0             1.0000             23
7820     0.0             1.0000             23
7821     2.0             1.0000             23
7822     2.0             1.0000             23
7824     2.0             1.0000             23
7825     0.0             1.0000             23
7827     2.0             0.3472             23
7830     1.0             1.0000             23
7882     0.0             1.0000             23
7888     2.0             0.6506             23
7890     2.0             1.0000             23
7892     2.0             1.0000             23
7897     0.0             0.6803             23
7899     1.0             1.0000             23
8203     2.0             1.0000             23
8204     2.0             0.6746             23
8208     2.0             0.6844             23
8236     0.0             1.0000             23
8246     2.0             0.6598             23
8247     1.0             1.0000             23
8250     1.0             1.0000             23
8251     0.0             0.6624             23
8261     1.0             1.0000             23
8264     0.0             1.0000             23
8269     0.0             0.6774             23
8272     2.0             1.0000             23
8284     2.0             0.6691             23
No. of records with gender 0 in cluster 23 is 10
No. of records with gender 1 in cluster 23 is 6
No. of records with gender 2 in cluster 23 is 16

Records found in cluster 24 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
4606      0.0             1.0000             24
4608      0.0             0.6618             24
4615      2.0             0.6590             24
4621      1.0             1.0000             24
4627      2.0             1.0000             24
4643      0.0             1.0000             24
4657      2.0             0.6751             24
4664      1.0             1.0000             24
4674      2.0             1.0000             24
4675      2.0             1.0000             24
4685      2.0             1.0000             24
4690      0.0             0.6763             24
4691      0.0             1.0000             24
4710      2.0             1.0000             24
4712      0.0             1.0000             24
4717      2.0             1.0000             24
4720      2.0             1.0000             24
4722      2.0             0.6686             24
4731      1.0             1.0000             24
4743      2.0             1.0000             24
4746      1.0             1.0000             24
4772      2.0             1.0000             24
4778      1.0             0.3592             24
4780      2.0             1.0000             24
4781      2.0             0.6475             24
4782      1.0             0.6697             24
4783      2.0             1.0000             24
4785      2.0             0.6811             24
4789      2.0             1.0000             24
4790      1.0             1.0000             24
4798      2.0             0.6736             24
4799      0.0             1.0000             24
6627      2.0             1.0000             24
6629      1.0             1.0000             24
6633      0.0             1.0000             24
6650      2.0             1.0000             24
6654      1.0             1.0000             24
6660      2.0             1.0000             24
6664      2.0             1.0000             24
6665      1.0             1.0000             24
6668      0.0             1.0000             24
6670      0.0             1.0000             24
6678      1.0             1.0000             24
6685      2.0             1.0000             24
6688      2.0             1.0000             24
11370     2.0             1.0000             24
13222     2.0             1.0000             24
No. of records with gender 0 in cluster 24 is 10
No. of records with gender 1 in cluster 24 is 11
No. of records with gender 2 in cluster 24 is 26

Records found in cluster 25 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
5013     1.0             1.0000             25
5567     2.0             1.0000             25
8109     1.0             1.0000             25
8112     0.0             1.0000             25
8113     2.0             0.6675             25
8116     2.0             0.6611             25
8118     1.0             1.0000             25
8122     2.0             0.6623             25
8123     2.0             0.6605             25
8128     0.0             1.0000             25
8132     2.0             0.6665             25
8146     1.0             1.0000             25
8159     2.0             1.0000             25
8165     0.0             1.0000             25
8176     1.0             1.0000             25
8177     2.0             1.0000             25
8178     2.0             1.0000             25
8185     2.0             1.0000             25
8190     2.0             0.6735             25
8191     1.0             0.3568             25
8192     2.0             0.6726             25
8199     2.0             1.0000             25
No. of records with gender 0 in cluster 25 is 3
No. of records with gender 1 in cluster 25 is 6
No. of records with gender 2 in cluster 25 is 13

Records found in cluster 26 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5206      1.0             1.0000             26
5629      2.0             1.0000             26
5640      0.0             1.0000             26
5944      1.0             1.0000             26
6093      1.0             0.6653             26
6157      2.0             0.6567             26
6174      2.0             0.6619             26
6409      0.0             1.0000             26
6514      1.0             1.0000             26
7289      0.0             1.0000             26
10812     1.0             0.6827             26
12073     1.0             1.0000             26
12796     1.0             1.0000             26
13106     1.0             0.6574             26
13303     1.0             1.0000             26
13417     1.0             1.0000             26
13502     1.0             1.0000             26
13716     1.0             0.6830             26
13901     2.0             0.6611             26
14140     0.0             0.6645             26
14214     2.0             1.0000             26
14269     2.0             0.6868             26
14337     1.0             1.0000             26
14412     1.0             1.0000             26
14483     0.0             1.0000             26
14645     1.0             1.0000             26
14855     2.0             1.0000             26
15443     2.0             1.0000             26
15534     0.0             1.0000             26
15807     0.0             1.0000             26
15916     1.0             1.0000             26
15950     2.0             1.0000             26
16188     1.0             1.0000             26
16418     2.0             1.0000             26
16672     1.0             1.0000             26
16725     1.0             1.0000             26
16854     2.0             1.0000             26
17269     0.0             1.0000             26
17351     1.0             0.6556             26
17442     1.0             1.0000             26
17842     0.0             1.0000             26
18302     1.0             1.0000             26
18412     2.0             0.6690             26
18510     1.0             1.0000             26
18731     1.0             1.0000             26
18738     2.0             1.0000             26
No. of records with gender 0 in cluster 26 is 9
No. of records with gender 1 in cluster 26 is 24
No. of records with gender 2 in cluster 26 is 13

Records found in cluster 27 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5400      0.0             1.0000             27
5401      2.0             0.6836             27
5407      2.0             0.6785             27
5408      2.0             1.0000             27
5409      0.0             1.0000             27
5412      2.0             1.0000             27
5427      1.0             1.0000             27
5429      0.0             1.0000             27
5433      2.0             0.6736             27
5434      1.0             1.0000             27
5436      2.0             0.6602             27
5442      1.0             0.3409             27
5443      2.0             0.6483             27
5447      1.0             1.0000             27
5448      2.0             0.6654             27
5449      1.0             1.0000             27
5456      0.0             1.0000             27
5457      2.0             0.6468             27
5466      2.0             1.0000             27
5470      2.0             1.0000             27
5471      0.0             1.0000             27
5472      0.0             1.0000             27
5480      1.0             1.0000             27
5485      2.0             1.0000             27
5486      1.0             1.0000             27
5487      1.0             0.6669             27
5490      2.0             1.0000             27
5491      2.0             1.0000             27
5635      2.0             1.0000             27
6074      0.0             1.0000             27
13614     1.0             1.0000             27
No. of records with gender 0 in cluster 27 is 7
No. of records with gender 1 in cluster 27 is 9
No. of records with gender 2 in cluster 27 is 15

Records found in cluster 28 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
5506     2.0             0.6595             28
5511     1.0             1.0000             28
5524     0.0             0.6722             28
5541     0.0             1.0000             28
5542     2.0             1.0000             28
5544     1.0             0.3374             28
5546     1.0             1.0000             28
5552     2.0             1.0000             28
5558     2.0             1.0000             28
5559     2.0             0.6745             28
5560     1.0             1.0000             28
5561     0.0             1.0000             28
5563     2.0             1.0000             28
5564     2.0             1.0000             28
5566     1.0             0.6607             28
5570     2.0             1.0000             28
5572     1.0             1.0000             28
5579     1.0             1.0000             28
5583     2.0             1.0000             28
5588     0.0             0.6795             28
5597     0.0             1.0000             28
5598     0.0             1.0000             28
No. of records with gender 0 in cluster 28 is 6
No. of records with gender 1 in cluster 28 is 7
No. of records with gender 2 in cluster 28 is 9

Records found in cluster 29 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5605      1.0             1.0000             29
5611      2.0             0.6856             29
5616      2.0             1.0000             29
5625      0.0             1.0000             29
5626      2.0             0.6589             29
5632      2.0             0.6651             29
5643      0.0             1.0000             29
5644      1.0             0.6725             29
5661      2.0             1.0000             29
5665      1.0             1.0000             29
5669      2.0             1.0000             29
5670      1.0             0.6752             29
5671      2.0             0.3424             29
5672      2.0             1.0000             29
5673      0.0             0.6761             29
5674      1.0             1.0000             29
5675      2.0             1.0000             29
5679      2.0             0.6816             29
5681      1.0             1.0000             29
5683      2.0             1.0000             29
5685      0.0             1.0000             29
5686      2.0             1.0000             29
5687      2.0             0.6799             29
5689      2.0             0.6805             29
5696      0.0             1.0000             29
5697      0.0             0.6892             29
18237     1.0             1.0000             29
No. of records with gender 0 in cluster 29 is 6
No. of records with gender 1 in cluster 29 is 7
No. of records with gender 2 in cluster 29 is 14

Records found in cluster 30 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5705      1.0             1.0000             30
5709      2.0             0.6860             30
5711      2.0             1.0000             30
5712      1.0             1.0000             30
5726      2.0             0.6735             30
5746      2.0             0.3410             30
5752      2.0             0.6747             30
5754      1.0             1.0000             30
5757      1.0             1.0000             30
5766      2.0             1.0000             30
5767      2.0             1.0000             30
5768      1.0             0.3631             30
5770      2.0             1.0000             30
5773      2.0             0.6769             30
5777      2.0             0.6638             30
5782      1.0             1.0000             30
5786      2.0             1.0000             30
5790      0.0             1.0000             30
5792      2.0             1.0000             30
5793      2.0             0.6675             30
5794      2.0             1.0000             30
5798      2.0             1.0000             30
10884     0.0             0.6712             30
11215     2.0             1.0000             30
No. of records with gender 0 in cluster 30 is 2
No. of records with gender 1 in cluster 30 is 6
No. of records with gender 2 in cluster 30 is 16

Records found in cluster 31 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
5800      1.0             1.0000             31
5807      1.0             1.0000             31
5809      0.0             1.0000             31
5810      1.0             1.0000             31
5819      2.0             0.6667             31
5835      2.0             1.0000             31
5838      2.0             1.0000             31
5841      2.0             0.6645             31
5843      0.0             0.6658             31
5846      2.0             1.0000             31
5849      0.0             0.6792             31
5861      2.0             0.6808             31
5862      0.0             1.0000             31
5868      1.0             1.0000             31
5869      1.0             1.0000             31
5870      0.0             0.3441             31
5877      1.0             1.0000             31
5881      2.0             1.0000             31
5883      2.0             0.6725             31
5885      2.0             0.6640             31
5894      1.0             1.0000             31
5898      2.0             0.6675             31
11156     1.0             1.0000             31
12450     1.0             1.0000             31
13833     0.0             0.6955             31
No. of records with gender 0 in cluster 31 is 6
No. of records with gender 1 in cluster 31 is 9
No. of records with gender 2 in cluster 31 is 10

Records found in cluster 32 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
6101      1.0             0.6543             32
6102      0.0             0.6699             32
6103      0.0             1.0000             32
6109      0.0             1.0000             32
6129      2.0             0.6778             32
6131      0.0             1.0000             32
6133      0.0             0.6655             32
6134      0.0             1.0000             32
6147      2.0             0.6540             32
6149      0.0             1.0000             32
6151      2.0             0.6642             32
6156      2.0             1.0000             32
6158      1.0             1.0000             32
6164      1.0             1.0000             32
6167      2.0             0.6742             32
6169      2.0             0.6866             32
6178      1.0             1.0000             32
6180      1.0             1.0000             32
6190      0.0             1.0000             32
6192      2.0             0.6652             32
6197      1.0             0.6513             32
15331     0.0             0.6709             32
No. of records with gender 0 in cluster 32 is 9
No. of records with gender 1 in cluster 32 is 6
No. of records with gender 2 in cluster 32 is 7

Records found in cluster 33 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
6301      2.0             1.0000             33
6302      2.0             1.0000             33
6309      1.0             0.3750             33
6311      2.0             1.0000             33
6318      1.0             1.0000             33
6319      0.0             0.6471             33
6327      2.0             0.6733             33
6332      0.0             1.0000             33
6358      2.0             0.6692             33
6366      2.0             0.6662             33
6373      2.0             1.0000             33
6374      2.0             1.0000             33
6378      0.0             1.0000             33
6381      1.0             1.0000             33
6383      2.0             0.6754             33
6389      0.0             1.0000             33
6390      1.0             1.0000             33
6391      1.0             1.0000             33
6393      2.0             1.0000             33
6397      1.0             1.0000             33
6398      2.0             1.0000             33
6399      1.0             1.0000             33
10210     2.0             0.6588             33
16505     1.0             1.0000             33
18762     1.0             1.0000             33
No. of records with gender 0 in cluster 33 is 4
No. of records with gender 1 in cluster 33 is 9
No. of records with gender 2 in cluster 33 is 12

Records found in cluster 34 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
6502      0.0             1.0000             34
6505      2.0             1.0000             34
6516      0.0             1.0000             34
6521      2.0             1.0000             34
6523      1.0             1.0000             34
...       ...                ...            ...
16234     2.0             0.6937             34
16385     1.0             1.0000             34
17421     0.0             1.0000             34
18026     0.0             1.0000             34
18443     2.0             1.0000             34

[70 rows x 3 columns]
No. of records with gender 0 in cluster 34 is 19
No. of records with gender 1 in cluster 34 is 16
No. of records with gender 2 in cluster 34 is 35

Records found in cluster 35 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
6722      1.0             1.0000             35
6726      0.0             1.0000             35
6728      2.0             0.6634             35
6730      2.0             0.6681             35
6732      1.0             0.6882             35
6742      2.0             0.6625             35
6758      0.0             0.3469             35
6759      1.0             0.6543             35
6772      2.0             1.0000             35
6786      2.0             0.6694             35
6787      2.0             1.0000             35
6788      2.0             1.0000             35
6789      2.0             1.0000             35
6793      1.0             0.6699             35
6795      2.0             0.6741             35
14387     1.0             1.0000             35
16986     0.0             1.0000             35
No. of records with gender 0 in cluster 35 is 3
No. of records with gender 1 in cluster 35 is 5
No. of records with gender 2 in cluster 35 is 9

Records found in cluster 36 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7002     2.0             1.0000             36
7016     0.0             1.0000             36
7017     2.0             0.6646             36
7033     1.0             1.0000             36
7040     1.0             1.0000             36
7043     0.0             1.0000             36
7048     2.0             1.0000             36
7052     2.0             0.6595             36
7053     2.0             1.0000             36
7058     1.0             1.0000             36
7062     0.0             1.0000             36
7065     2.0             1.0000             36
7087     2.0             0.6671             36
7091     1.0             0.6642             36
7095     2.0             1.0000             36
7096     2.0             0.6782             36
7097     2.0             0.6788             36
No. of records with gender 0 in cluster 36 is 3
No. of records with gender 1 in cluster 36 is 4
No. of records with gender 2 in cluster 36 is 10

Records found in cluster 37 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7101     2.0             1.0000             37
7102     0.0             1.0000             37
7105     0.0             1.0000             37
7109     1.0             1.0000             37
7113     2.0             0.6718             37
7115     0.0             0.3451             37
7123     0.0             1.0000             37
7128     2.0             0.6585             37
7130     2.0             1.0000             37
7136     1.0             0.6835             37
7148     0.0             0.6750             37
7153     1.0             1.0000             37
7158     1.0             1.0000             37
7162     1.0             1.0000             37
7166     2.0             0.6635             37
7176     1.0             1.0000             37
7184     2.0             1.0000             37
No. of records with gender 0 in cluster 37 is 5
No. of records with gender 1 in cluster 37 is 6
No. of records with gender 2 in cluster 37 is 6

Records found in cluster 38 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7210     0.0             0.6617             38
7215     2.0             1.0000             38
7216     2.0             0.6921             38
7228     2.0             0.6766             38
7230     1.0             1.0000             38
7234     0.0             1.0000             38
7250     2.0             1.0000             38
7258     1.0             0.6902             38
7259     0.0             1.0000             38
7260     2.0             1.0000             38
7266     2.0             1.0000             38
7273     1.0             1.0000             38
7277     0.0             0.3487             38
7284     0.0             0.6661             38
7288     2.0             1.0000             38
7297     2.0             0.6853             38
No. of records with gender 0 in cluster 38 is 5
No. of records with gender 1 in cluster 38 is 3
No. of records with gender 2 in cluster 38 is 8

Records found in cluster 39 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7381     2.0             1.0000             39
7470     1.0             0.6810             39
7542     0.0             1.0000             39
7871     2.0             1.0000             39
7946     1.0             1.0000             39
8253     2.0             1.0000             39
8477     1.0             1.0000             39
8657     1.0             1.0000             39
8755     0.0             0.6707             39
8810     0.0             1.0000             39
9039     1.0             1.0000             39
9247     2.0             0.6622             39
9317     0.0             1.0000             39
No. of records with gender 0 in cluster 39 is 4
No. of records with gender 1 in cluster 39 is 5
No. of records with gender 2 in cluster 39 is 4

Records found in cluster 40 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
7500     1.0             1.0000             40
7502     1.0             0.6617             40
7505     0.0             1.0000             40
7507     0.0             0.6848             40
7508     0.0             1.0000             40
7509     1.0             1.0000             40
7510     0.0             1.0000             40
7511     2.0             1.0000             40
7512     1.0             0.6739             40
7513     0.0             1.0000             40
7524     1.0             1.0000             40
7531     2.0             1.0000             40
7532     2.0             1.0000             40
7534     2.0             1.0000             40
7581     1.0             1.0000             40
7586     0.0             1.0000             40
7593     2.0             1.0000             40
7596     0.0             1.0000             40
7598     2.0             1.0000             40
No. of records with gender 0 in cluster 40 is 7
No. of records with gender 1 in cluster 40 is 6
No. of records with gender 2 in cluster 40 is 6

Records found in cluster 41 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
7616      2.0             0.6675             41
7675      2.0             1.0000             41
7744      2.0             0.6761             41
7795      1.0             0.6602             41
8010      1.0             1.0000             41
8069      1.0             1.0000             41
8125      1.0             1.0000             41
8180      1.0             0.6850             41
8395      1.0             1.0000             41
8532      1.0             1.0000             41
8587      2.0             1.0000             41
8906      1.0             0.7047             41
8977      1.0             1.0000             41
9101      0.0             0.3496             41
9172      0.0             1.0000             41
10038     0.0             1.0000             41
17122     2.0             0.6583             41
No. of records with gender 0 in cluster 41 is 3
No. of records with gender 1 in cluster 41 is 9
No. of records with gender 2 in cluster 41 is 5

Records found in cluster 42 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
8024      2.0             1.0000             42
8033      0.0             0.6701             42
8039      1.0             1.0000             42
8046      2.0             1.0000             42
8050      2.0             1.0000             42
8052      0.0             0.7050             42
8055      0.0             1.0000             42
8057      1.0             1.0000             42
8058      2.0             1.0000             42
8059      1.0             1.0000             42
8062      1.0             1.0000             42
8063      1.0             1.0000             42
8065      1.0             0.6688             42
8067      2.0             0.3442             42
8068      1.0             1.0000             42
8070      1.0             0.6698             42
8078      0.0             1.0000             42
8081      2.0             1.0000             42
8085      0.0             1.0000             42
8097      0.0             1.0000             42
16912     1.0             0.6483             42
No. of records with gender 0 in cluster 42 is 6
No. of records with gender 1 in cluster 42 is 9
No. of records with gender 2 in cluster 42 is 6

Records found in cluster 43 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
8607      2.0             0.6659             43
8613      2.0             1.0000             43
8616      2.0             1.0000             43
8617      2.0             0.6774             43
8619      0.0             0.6647             43
8620      2.0             0.6975             43
8622      0.0             0.6634             43
8623      2.0             0.6778             43
8624      1.0             1.0000             43
8627      2.0             0.6829             43
8632      2.0             1.0000             43
8638      0.0             1.0000             43
8642      2.0             0.6688             43
8645      2.0             0.6778             43
8647      2.0             1.0000             43
8675      2.0             1.0000             43
8676      1.0             0.6602             43
8677      0.0             0.6772             43
8679      2.0             1.0000             43
8680      2.0             1.0000             43
8681      0.0             0.6507             43
8688      2.0             0.3354             43
8690      2.0             1.0000             43
8691      2.0             0.3595             43
8694      2.0             0.6736             43
8699      0.0             1.0000             43
13069     0.0             1.0000             43
13603     1.0             1.0000             43
15290     2.0             1.0000             43
17358     0.0             1.0000             43
No. of records with gender 0 in cluster 43 is 8
No. of records with gender 1 in cluster 43 is 3
No. of records with gender 2 in cluster 43 is 19

Records found in cluster 44 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
8804      2.0             0.6561             44
8834      2.0             1.0000             44
8843      0.0             0.3571             44
8844      2.0             1.0000             44
8849      0.0             0.6906             44
8852      0.0             1.0000             44
8854      0.0             1.0000             44
8855      1.0             0.6440             44
8859      2.0             1.0000             44
8864      0.0             0.3421             44
8865      1.0             1.0000             44
8873      0.0             1.0000             44
8874      1.0             1.0000             44
8878      2.0             0.6640             44
8881      0.0             1.0000             44
8884      1.0             0.6612             44
8886      2.0             0.3536             44
17100     1.0             1.0000             44
No. of records with gender 0 in cluster 44 is 7
No. of records with gender 1 in cluster 44 is 5
No. of records with gender 2 in cluster 44 is 6

Records found in cluster 45 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
9001      1.0             1.0000             45
9020      0.0             1.0000             45
9028      1.0             0.6849             45
9033      0.0             1.0000             45
9038      1.0             0.6667             45
9043      1.0             1.0000             45
9046      2.0             0.6745             45
9050      1.0             0.6658             45
9052      2.0             0.6826             45
9054      1.0             1.0000             45
9055      1.0             1.0000             45
9056      2.0             1.0000             45
9061      0.0             1.0000             45
9064      2.0             1.0000             45
9069      2.0             0.6595             45
9070      0.0             1.0000             45
9072      1.0             0.6774             45
9076      2.0             1.0000             45
9079      0.0             1.0000             45
9080      1.0             0.6532             45
9081      0.0             1.0000             45
9082      0.0             1.0000             45
9083      0.0             1.0000             45
9089      1.0             1.0000             45
12197     2.0             1.0000             45
13641     0.0             1.0000             45
No. of records with gender 0 in cluster 45 is 9
No. of records with gender 1 in cluster 45 is 10
No. of records with gender 2 in cluster 45 is 7

Records found in cluster 46 from DBSCAN in Exp 3
      gender  gender:confidence  Cluster_Label
9105     2.0             0.6468             46
9109     0.0             0.6553             46
9112     1.0             1.0000             46
9113     0.0             1.0000             46
9115     2.0             0.6771             46
9118     2.0             0.6712             46
9123     2.0             1.0000             46
9125     2.0             1.0000             46
9130     2.0             0.6741             46
9136     2.0             1.0000             46
9144     2.0             1.0000             46
9150     1.0             1.0000             46
9151     1.0             0.6453             46
9152     0.0             1.0000             46
9165     0.0             1.0000             46
9166     2.0             1.0000             46
9178     2.0             0.6698             46
9190     1.0             1.0000             46
9194     2.0             1.0000             46
9195     1.0             1.0000             46
9945     2.0             0.6779             46
No. of records with gender 0 in cluster 46 is 4
No. of records with gender 1 in cluster 46 is 5
No. of records with gender 2 in cluster 46 is 12

Records found in cluster 47 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
9382      2.0             1.0000             47
9398      1.0             1.0000             47
9475      0.0             1.0000             47
9496      0.0             1.0000             47
9511      2.0             0.6634             47
...       ...                ...            ...
15169     1.0             1.0000             47
15207     1.0             1.0000             47
15391     2.0             1.0000             47
15439     2.0             1.0000             47
15622     2.0             1.0000             47

[68 rows x 3 columns]
No. of records with gender 0 in cluster 47 is 18
No. of records with gender 1 in cluster 47 is 24
No. of records with gender 2 in cluster 47 is 26

Records found in cluster 48 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
9648      0.0             1.0000             48
10111     2.0             1.0000             48
10551     2.0             0.6362             48
10903     1.0             1.0000             48
11265     1.0             1.0000             48
11650     0.0             1.0000             48
12295     0.0             1.0000             48
12731     2.0             1.0000             48
15770     0.0             0.6808             48
16201     2.0             1.0000             48
No. of records with gender 0 in cluster 48 is 4
No. of records with gender 1 in cluster 48 is 2
No. of records with gender 2 in cluster 48 is 4

Records found in cluster 49 from DBSCAN in Exp 3
       gender  gender:confidence  Cluster_Label
11119     1.0             1.0000             49
11727     2.0             1.0000             49
12333     1.0             1.0000             49
12992     0.0             1.0000             49
13486     2.0             1.0000             49
14046     0.0             1.0000             49
14958     2.0             1.0000             49
15597     1.0             0.3362             49
16706     0.0             1.0000             49
17186     1.0             1.0000             49
17599     0.0             0.6654             49
18270     0.0             1.0000             49
No. of records with gender 0 in cluster 49 is 5
No. of records with gender 1 in cluster 49 is 4
No. of records with gender 2 in cluster 49 is 3
Records classified as noise
       gender  gender:confidence  Cluster_Label
812       2.0             0.6678             -1
1367      1.0             1.0000             -1
1544      0.0             1.0000             -1
2154      1.0             0.6561             -1
2382      1.0             1.0000             -1
2481      0.0             1.0000             -1
2897      2.0             1.0000             -1
3283      2.0             1.0000             -1
3341      1.0             1.0000             -1
3526      1.0             1.0000             -1
3938      2.0             0.6545             -1
4051      2.0             1.0000             -1
4277      1.0             1.0000             -1
4650      2.0             0.3571             -1
5424      0.0             1.0000             -1
6140      2.0             0.6679             -1
6313      1.0             1.0000             -1
7107      2.0             0.6865             -1
7453      2.0             0.6782             -1
8798      1.0             1.0000             -1
8836      0.0             0.6645             -1
8905      1.0             1.0000             -1
14448     0.0             1.0000             -1
14613     0.0             1.0000             -1
14791     1.0             1.0000             -1
15015     1.0             1.0000             -1
15216     0.0             1.0000             -1

---- VISUALIZE THE METRIC EVALUATION ----
No description has been provided for this image
No description has been provided for this image

REGRESSION¶

In [3]:
# =============================== REGRESSION ======================================
print()
print()
df_preprocessed_reg = df_preprocessed.copy()
y = df_preprocessed["gender:confidence"].reset_index(drop=True)
df_preprocessed_reg = df_preprocessed_reg.drop(['gender', "gender:confidence"], axis=1)

print()
print("=" * 50)
print('Boosted Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed_reg, y, test_size=0.6, random_state=42)
boosted_reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model
boosted_reg.fit(X_train, y_train)

# Make predictions
y_pred = boosted_reg.predict(X_test)
y_pred_train = boosted_reg.predict(X_train)
y_tot_pred = boosted_reg.predict(df_preprocessed_reg)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)

print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()

# FEATURE IMPORTANCE
print()
print("Performing feature importance analysis...")
# Find column indices that start with 'desc_' and 'text_'
desc_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('desc_')]
text_columns = [i for i, col in enumerate(df_preprocessed_reg.columns) if col.startswith('text_')]
# Access the corresponding elements from the ndarray using the column indices
desc_array = boosted_reg.feature_importances_[desc_columns]
text_array = boosted_reg.feature_importances_[text_columns]
# Output the results
# print("desc_ column indices:", desc_columns)
# print("text_ column indices:", text_columns)
# print("desc_ array:\n", desc_array)
# print("text_ array:\n", text_array)
# Sum the values for desc_ and text_ columns
desc_sum = np.sum(boosted_reg.feature_importances_[desc_columns])
text_sum = np.sum(boosted_reg.feature_importances_[text_columns])
# Create a new DataFrame
new_data = {}
# Add the 'desc' and 'text' columns with the summed values
new_data['desc'] = [desc_sum]
new_data['text'] = [text_sum]
boosted_reg.feature_importances_
# Add the other feature columns that are not desc_ or text_
other_columns = [i for i in range(len(df_preprocessed_reg.columns)) if i not in desc_columns and i not in text_columns]
for i in other_columns:
    col_name = df_preprocessed_reg.columns[i]
    new_data[col_name] = [boosted_reg.feature_importances_[i]]
# Convert the new_data dictionary to a DataFrame
feature_importance = pd.DataFrame(new_data)
# Output the results
print(feature_importance)

# Plot feature importance
df_melted = feature_importance.melt(var_name='Feature', value_name='Importance in percentage')
df_melted = df_melted.sort_values(ascending=False, by="Importance in percentage")
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=df_melted, palette='viridis')
plt.suptitle('Boosted Regression Tree with Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()


# preprocess dataset for plots with regression results
df_preprocessed_diff = df_preprocessed_reg.copy()
df_preprocessed_diff['difference'] = (y.to_numpy() - y_tot_pred)
df_preprocessed_diff["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_diff["gender:confidence"] = y_reset

# filtering out coloumns that might be false mistaken
misclassified_df_reg = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
misclassified_df = df_preprocessed_diff[(df_preprocessed_diff["difference"] > 0.1) & (df_preprocessed_diff["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train.index)]

# plotting these columns

def scatterplot_mistaken_points(misclassified_df, X_train, model):
    # Edit misclassified_df to include 'in X_train'
    misclassified_df["in X_train"] = misclassified_df.index.isin(X_train.index)
    # Create subsets for the two plots
    df_in_X_train = misclassified_df[misclassified_df["in X_train"]]
    df_not_in_X_train = misclassified_df[~misclassified_df["in X_train"]]
    # Set up the matplotlib figure with subplots
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    # Set the main title
    fig.suptitle(f'{model}\nGender Confidence of "Mistaken" Records', fontsize=16)
    # Plot 1: Points in X_train
    sns.scatterplot(data=df_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[0], color='blue')
    axes[0].plot([df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()],
                 [df_in_X_train['gender:confidence'].min(), df_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[0].set_xlabel('Dataset')
    axes[0].set_ylabel('Predicted')
    axes[0].set_title(f'Training Set\nSample Size: {len(df_in_X_train)}')
    # Plot 2: Points not in X_train
    sns.scatterplot(data=df_not_in_X_train, x='gender:confidence', y='gender_confidence_pred', alpha=0.4, ax=axes[1], color='red')
    axes[1].plot([df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()],
                 [df_not_in_X_train['gender:confidence'].min(), df_not_in_X_train['gender:confidence'].max()], 'k--', lw=2)
    axes[1].set_xlabel('Dataset')
    axes[1].set_ylabel('Predicted')
    axes[1].set_title(f'Not Training Set\nSample Size: {len(df_not_in_X_train)}')
    plt.tight_layout()
    plt.show()

def scatter_plot(y, y_tot_pred, model):
    # Plotting more results results
    plt.figure(figsize=(10, 8))
    plt.scatter(y, y_tot_pred, alpha=0.5)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
    plt.xlabel('Dataset', fontsize=12)
    plt.ylabel('Predicted', fontsize=12)
    plt.suptitle(model, fontsize=16)
    plt.title('Gender Confidence Comparison', fontsize=14)
    plt.show()

scatterplot_mistaken_points(misclassified_df, X_train, "Boosted Regression Tree with Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree with Vectorised Text/Desc Features")

# ==============================analyze without text features=============================================
columns_to_drop = [col for col in df_preprocessed_reg.columns if col.startswith(('desc_', 'text_'))]
df_preprocessed_non_text = df_preprocessed_reg.drop(columns=columns_to_drop)
print(df_preprocessed_non_text)

print()
print("=" * 50)
print('Boosted Regression Tree without Vectorised Text/Desc Features')
print("=" * 50)

boosted_reg_non_text = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
# Split the dataset into training and testing sets
X_train_non_text, X_test_non_text, y_train_non_text, y_test_non_text = train_test_split(df_preprocessed_non_text, y, test_size=0.6, random_state=42)
# Fit the model
boosted_reg_non_text.fit(X_train_non_text, y_train_non_text)
# Make predictions
y_pred = boosted_reg_non_text.predict(X_test_non_text)
y_pred_train = boosted_reg_non_text.predict(X_train_non_text)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test_non_text, y_pred)
mse_train = mean_squared_error(y_train_non_text, y_pred_train)
mse_total = mean_squared_error(y, y_tot_pred)
y_tot_pred = boosted_reg_non_text.predict(df_preprocessed_non_text)

print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()

# Get feature importances and plot from the model
print()
print("Performing feature importance analysis...")
feature_importances = boosted_reg_non_text.feature_importances_
column_names = X_train_non_text.columns
feature_importance_df = pd.DataFrame({
    'Feature': column_names,
    'Importance in percentage': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance in percentage', ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance in percentage', y='Feature', data=feature_importance_df, palette='viridis')
plt.suptitle('Boosted Regression Tree without Vectorised Text/Desc Features', fontsize=16)
plt.title('Feature Importance Analysis', fontsize=14)
plt.show()

# adding the dataset gender confidence
df_preprocessed_non_text["gender_confidence_pred"] = y_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_non_text["gender:confidence"] = y_reset

# Inspecting coulumns that could be suspicous
df_preprocessed_non_text["difference"] = y.to_numpy() - y_tot_pred
misclassified_df = df_preprocessed_non_text[(df_preprocessed_non_text["difference"] > 0.1) & (df_preprocessed_non_text["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_non_text.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_non_text.index)]
scatterplot_mistaken_points(misclassified_df, X_train_non_text, "Boosted Regression Tree without Vectorised Text/Desc Features")
scatter_plot(y, y_tot_pred, "Boosted Regression Tree without Vectorised Text/Desc Features")

# ====================================Analyzing with a linear regression (Least Squares Implementation)====================

print()
print("=" * 50)
print('Linear Regression Tree with Vectorised Text/Desc Features')
print("=" * 50)

X_train_lin = sm.add_constant(X_train)
X_test_lin = sm.add_constant(X_test)
df_preprocessed_lin = sm.add_constant(df_preprocessed_reg)
model = sm.OLS(y_train, X_train_lin)  # Ordinary least squares (unregularized)
results = model.fit()

# run predictions
y_lin_pred = results.predict(X_test_lin)
y_lin_tot_pred = results.predict(df_preprocessed_lin)
y_lin_train = results.predict(X_train_lin)

# Evaluate performance using Mean Squared Error
mse_test = mean_squared_error(y_test, y_lin_pred)
mse_total = mean_squared_error(y, y_lin_tot_pred)
mse_train = mean_squared_error(y_train, y_lin_train)

print(f"Mean Squared Error (Train): {mse_train:.4f}")
print(f"Mean Squared Error (Test): {mse_test:.4f}")
print(f"Mean Squared Error (Total): {mse_total:.4f}")

# PLOT MSE
labels = ['Train', 'Test', 'Total']
mse_values = [mse_train, mse_test, mse_total]
plt.figure(figsize=(8, 6))
plt.bar(labels, mse_values, color=['skyblue', 'salmon', 'lightgreen'])
plt.suptitle('Linear Regression Tree with Vectorised Textual Features', fontsize=16)
plt.title('Mean Squared Error Comparison', fontsize=14)
plt.xlabel('Dataset Type')
plt.ylabel('MSE')
plt.show()

# final preprocess
df_preprocessed_lin["difference"] = y.to_numpy() - y_lin_tot_pred
y_reset = y.reset_index(drop=True)
df_preprocessed_lin["gender:confidence"] = y
df_preprocessed_lin["gender_confidence_pred"] = y_lin_tot_pred


# identify mistaken users
misclassified_df = df_preprocessed_lin[(df_preprocessed_lin["difference"] > 0.1) & (df_preprocessed_lin["gender_confidence_pred"] < 0.85)]
non_train_misclassify = misclassified_df[misclassified_df.index.isin(X_train_lin.index)]
train_misclassify = misclassified_df[~misclassified_df.index.isin(X_train_lin.index)]

scatter_plot(y, y_lin_tot_pred, "Linear Regression Tree with Vectorised Text/Desc Features")
scatterplot_mistaken_points(misclassified_df, X_train_lin, "Linear Regression Tree with Vectorised Text/Desc Features")


# ================================Identity final mistaken samples====================================
common_samples = misclassified_df_reg.index.intersection(misclassified_df.index)
common_df = misclassified_df.loc[common_samples]

scatterplot_mistaken_points(common_df, X_train_lin, "Boosted and Linear Regression Trees (Intersection) with Vectorised Text/Desc Features")


==================================================
Boosted Regression Tree with Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0266
Mean Squared Error (Test): 0.0290
Mean Squared Error (Total): 0.0280
No description has been provided for this image
Performing feature importance analysis...
       desc      text  favorites_per_day  retweets_per_day  tweets_per_day  \
0  0.307368  0.365717           0.021232               0.0        0.121167   

   profile_created_year  tweet_created_year    link_R    link_G    link_B  \
0              0.155415                 0.0  0.000336  0.011339  0.000434   

   sidebar_R  sidebar_G  sidebar_B  
0   0.005375   0.006886    0.00473  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
       favorites_per_day  retweets_per_day  tweets_per_day  \
0               0.000000          0.000000       28.156306   
1               0.015557          0.000000        1.709220   
2               2.147921          0.000279        1.567681   
3               0.036214          0.000000        0.303514   
4               9.797322          0.000000        8.259911   
...                  ...               ...             ...   
18831           0.090636          0.000000        0.234994   
18832           0.568938          0.000000        3.061580   
18833           0.011366          0.000000        6.005683   
18834          16.336871          0.000000       12.937933   
18835           0.878510          0.000000        0.766728   

       profile_created_year  tweet_created_year  link_R  link_G  link_B  \
0                      2013                2015       8     194     194   
1                      2012                2015       0     132     180   
2                      2014                2015     171     184     194   
3                      2009                2015       0     132     180   
4                      2014                2015      59     148     217   
...                     ...                 ...     ...     ...     ...   
18831                  2015                2015       0     132     180   
18832                  2012                2015     207     185      41   
18833                  2012                2015       0     132     180   
18834                  2012                2015     146     102     204   
18835                  2014                2015       0     132     180   

       sidebar_R  sidebar_G  sidebar_B  
0            255        255        255  
1            192        222        237  
2            192        222        237  
3            192        222        237  
4              0          0          0  
...          ...        ...        ...  
18831        192        222        237  
18832          0          0          0  
18833        192        222        237  
18834          0          0          0  
18835        192        222        237  

[18836 rows x 11 columns]

==================================================
Boosted Regression Tree without Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0274
Mean Squared Error (Test): 0.0291
Mean Squared Error (Total): 0.0280
No description has been provided for this image
Performing feature importance analysis...
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==================================================
Linear Regression Tree with Vectorised Text/Desc Features
==================================================
Mean Squared Error (Train): 0.0166
Mean Squared Error (Test): 0.0499
Mean Squared Error (Total): 0.0366
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

CLASSIFICATION¶

In [4]:
# ============================== CLASSIFICATION ==============================

print()
print()
print('---- CLASSIFICATION ----')
# Features and target
X = df_preprocessed.drop(columns=['gender'])  # Assuming 'gender' is the target variable
y = df_preprocessed['gender']

# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the performance
print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("\nXGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

# Initialize LightGBM classifier
lgb_clf = lgb.LGBMClassifier(n_estimators=100, random_state=42)

# Fit the model
lgb_clf.fit(X_train, y_train)

# Predict
y_pred_lgb = lgb_clf.predict(X_test)

# Evaluation
print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb))

# Helper function to plot confusion matrix
def plot_confusion_matrix(y_test, y_pred, model_name):
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'{model_name} Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()


# Helper function to extract and display classification report with model name
def get_classification_report(y_test, y_pred, model_name):
    report = classification_report(y_test, y_pred, output_dict=True)
    df = pd.DataFrame(report).transpose()
    df['model'] = model_name
    return df

# Random Forest Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")
rf_report = get_classification_report(y_test, y_pred_rf, "Random Forest")

# XGBoost Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_xgb, "XGBoost")
xgb_report = get_classification_report(y_test, y_pred_xgb, "XGBoost")

# LightGBM Confusion Matrix and Classification Report
plot_confusion_matrix(y_test, y_pred_lgb, "LightGBM")
lgb_report = get_classification_report(y_test, y_pred_lgb, "LightGBM")

# Combine all reports
combined_report = pd.concat([rf_report, xgb_report, lgb_report])

# Debugging Step: Check the combined report structure
print("Combined Classification Report:\n", combined_report.head())

# Filter out rows for precision, recall, and f1-score
combined_report_filtered = combined_report[
    combined_report.index.isin(['0', '1'])  # Filter for the classes
].reset_index()

# Debugging Step: Check the filtered report structure
print("Filtered Report for Precision, Recall, and F1-Score:\n", combined_report_filtered.head())

# Plot Precision, Recall, and F1-Score for each model
metrics = ['precision', 'recall', 'f1-score']

for metric in metrics:
    # Debugging Step: Filter for specific metric
    print(f"Data for {metric}:")
    print(combined_report_filtered[['index', metric, 'model']])

    plt.figure(figsize=(10, 6))
    sns.barplot(
        x="index",
        y=metric,
        hue="model",
        data=combined_report_filtered[['index', metric, 'model']]
    )
    plt.title(f'{metric.capitalize()} Comparison')
    plt.ylabel(metric.capitalize())
    plt.xlabel('Class (0 = Human, 1 = Non-Human)')
    plt.show()

# Accuracy comparison
accuracies = {
    'Random Forest': accuracy_score(y_test, y_pred_rf),
    'XGBoost': accuracy_score(y_test, y_pred_xgb),
    'LightGBM': accuracy_score(y_test, y_pred_lgb)
}

plt.figure(figsize=(6, 4))
plt.bar(accuracies.keys(), accuracies.values(), color=['blue', 'green', 'red'])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()

---- CLASSIFICATION ----
Accuracy Score:  0.6239384288747346
Confusion Matrix:
 [[665 469 133]
 [289 929 100]
 [251 175 757]]
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.52      0.54      1267
           1       0.59      0.70      0.64      1318
           2       0.76      0.64      0.70      1183

    accuracy                           0.62      3768
   macro avg       0.64      0.62      0.63      3768
weighted avg       0.63      0.62      0.62      3768


XGBoost Classifier Report:
              precision    recall  f1-score   support

           0       0.56      0.54      0.55      1267
           1       0.60      0.65      0.62      1318
           2       0.72      0.67      0.69      1183

    accuracy                           0.62      3768
   macro avg       0.62      0.62      0.62      3768
weighted avg       0.62      0.62      0.62      3768

Accuracy: 0.6191613588110403
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36890
[LightGBM] [Info] Number of data points in the train set: 15068, number of used features: 1766
[LightGBM] [Info] Start training from score -1.117843
[LightGBM] [Info] Start training from score -1.029513
[LightGBM] [Info] Start training from score -1.152536
LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.55      0.56      1267
           1       0.61      0.65      0.63      1318
           2       0.72      0.69      0.70      1183

    accuracy                           0.63      3768
   macro avg       0.63      0.63      0.63      3768
weighted avg       0.63      0.63      0.63      3768

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Combined Classification Report:
            precision    recall  f1-score      support          model
0           0.551867  0.524862  0.538026  1267.000000  Random Forest
1           0.590591  0.704856  0.642684  1318.000000  Random Forest
2           0.764646  0.639899  0.696733  1183.000000  Random Forest
accuracy    0.623938  0.623938  0.623938     0.623938  Random Forest
macro avg   0.635702  0.623205  0.625814  3768.000000  Random Forest
Filtered Report for Precision, Recall, and F1-Score:
   index  precision    recall  f1-score  support          model
0     0   0.551867  0.524862  0.538026   1267.0  Random Forest
1     1   0.590591  0.704856  0.642684   1318.0  Random Forest
2     0   0.557096  0.539069  0.547934   1267.0        XGBoost
3     1   0.596540  0.654021  0.623959   1318.0        XGBoost
4     0   0.568994  0.553275  0.561024   1267.0       LightGBM
Data for precision:
  index  precision          model
0     0   0.551867  Random Forest
1     1   0.590591  Random Forest
2     0   0.557096        XGBoost
3     1   0.596540        XGBoost
4     0   0.568994       LightGBM
5     1   0.605674       LightGBM
No description has been provided for this image
Data for recall:
  index    recall          model
0     0  0.524862  Random Forest
1     1  0.704856  Random Forest
2     0  0.539069        XGBoost
3     1  0.654021        XGBoost
4     0  0.553275       LightGBM
5     1  0.647951       LightGBM
No description has been provided for this image
Data for f1-score:
  index  f1-score          model
0     0  0.538026  Random Forest
1     1  0.642684  Random Forest
2     0  0.547934        XGBoost
3     1  0.623959        XGBoost
4     0  0.561024       LightGBM
5     1  0.626100       LightGBM
No description has been provided for this image
No description has been provided for this image

ASSOCIATION RULES¶

In [10]:
# ============================== ASSOCIATION RULES ==============================
print()
print()
print('---- ASSOCIATION RULES ----')
# Binarize numeric columns
df_asso['high_favorites'] = df_asso['favorites_per_day'] > df_asso['favorites_per_day'].mean()
df_asso['high_retweets'] = df_asso['retweets_per_day'] > df_asso['retweets_per_day'].mean()
df_asso['high_tweets'] = df_asso['tweets_per_day'] > df_asso['tweets_per_day'].mean()

# Binarize year columns (profile_created_year and tweet_created_year)
# Example: Set threshold year as 2015
df_asso['profile_recent'] = df_asso['profile_created_year'] >= 2015
df_asso['tweet_recent'] = df_asso['tweet_created_year'] >= 2015

# Select only the binary columns
df_apriori = df_asso[['high_favorites', 'high_retweets', 'high_tweets',
                              'profile_recent', 'tweet_recent',
                              'tweet_location_encoded', 'user_timezone_encoded']]

# Convert all columns to int (0 or 1)
df_apriori = df_apriori.astype(int)

# Apply Apriori
frequent_itemsets = apriori(df_apriori, min_support=0.05, use_colnames=True)

# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display the rules
print(rules)


top_frequent_itemsets = frequent_itemsets.nlargest(10, 'support')

plt.figure(figsize=(10, 6))
sns.barplot(x='support', y='itemsets', data=top_frequent_itemsets)
plt.title('Top 10 Frequent Itemsets by Support')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.show()

# ---------------------------
# Visualization 2: Scatter Plot of Association Rules by Confidence and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.scatterplot(x='confidence', y='lift', size='support', data=rules, hue='antecedents', palette='viridis', sizes=(40, 200))
plt.title('Association Rules: Confidence vs Lift')
plt.xlabel('Confidence')
plt.ylabel('Lift')
plt.legend(title='Antecedents', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# ---------------------------
# Visualization 3: Heatmap of Support, Confidence, and Lift
# ---------------------------
plt.figure(figsize=(10, 6))
sns.heatmap(rules[['support', 'confidence', 'lift']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between Support, Confidence, and Lift')
plt.show()

---- ASSOCIATION RULES ----
                       antecedents                     consequents  \
0                 (high_favorites)                   (high_tweets)   
1                    (high_tweets)                (high_favorites)   
2                 (high_favorites)                  (tweet_recent)   
3                   (tweet_recent)                (high_favorites)   
4                    (high_tweets)                  (tweet_recent)   
5                   (tweet_recent)                   (high_tweets)   
6                 (profile_recent)                  (tweet_recent)   
7                   (tweet_recent)                (profile_recent)   
8    (high_favorites, high_tweets)                  (tweet_recent)   
9   (high_favorites, tweet_recent)                   (high_tweets)   
10     (high_tweets, tweet_recent)                (high_favorites)   
11                (high_favorites)     (high_tweets, tweet_recent)   
12                   (high_tweets)  (high_favorites, tweet_recent)   
13                  (tweet_recent)   (high_favorites, high_tweets)   

    antecedent support  consequent support   support  confidence     lift  \
0             0.210607            0.271767  0.066097    0.313839  1.15481   
1             0.271767            0.210607  0.066097    0.243212  1.15481   
2             0.210607            1.000000  0.210607    1.000000  1.00000   
3             1.000000            0.210607  0.210607    0.210607  1.00000   
4             0.271767            1.000000  0.271767    1.000000  1.00000   
5             1.000000            0.271767  0.271767    0.271767  1.00000   
6             0.175568            1.000000  0.175568    1.000000  1.00000   
7             1.000000            0.175568  0.175568    0.175568  1.00000   
8             0.066097            1.000000  0.066097    1.000000  1.00000   
9             0.210607            0.271767  0.066097    0.313839  1.15481   
10            0.271767            0.210607  0.066097    0.243212  1.15481   
11            0.210607            0.271767  0.066097    0.313839  1.15481   
12            0.271767            0.210607  0.066097    0.243212  1.15481   
13            1.000000            0.066097  0.066097    0.066097  1.00000   

    leverage  conviction  zhangs_metric  
0   0.008861    1.061316       0.169823  
1   0.008861    1.043082       0.184085  
2   0.000000         inf       0.000000  
3   0.000000    1.000000       0.000000  
4   0.000000         inf       0.000000  
5   0.000000    1.000000       0.000000  
6   0.000000         inf       0.000000  
7   0.000000    1.000000       0.000000  
8   0.000000         inf       0.000000  
9   0.008861    1.061316       0.169823  
10  0.008861    1.043082       0.184085  
11  0.008861    1.061316       0.169823  
12  0.008861    1.043082       0.184085  
13  0.000000    1.000000       0.000000  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image